from selenium import webdriver
from lxml import etree
import re

# Route traffic through a proxy so Lagou's anti-crawling checks don't see your real IP.
# Plenty of free proxy IPs can be found online; 66代理 is a decent source.
chromeOptions = webdriver.ChromeOptions()
chromeOptions.add_argument("--proxy-server=http://proxy_ip:port")

driver_path = r'D:\chromedriver\chromedriver.exe'
driver = webdriver.Chrome(executable_path=driver_path, chrome_options=chromeOptions)
driver.get('https://www.lagou.com/jobs/list_java?labelWords=&fromSearch=true&suginput=')

# Parse the rendered page; each <li> carries the job info in data-* attributes.
html = etree.HTML(driver.page_source)
items = html.xpath("//ul[@class='item_con_list']/li")
nodes = html.xpath("//div[@class='li_b_l']/text()")

# Keep only the non-empty experience/education texts, stripped of spaces and newlines.
asks = []
for node in nodes:
    text = re.sub(r'[ \n]', '', node)
    if text:
        asks.append(text)

for index, item in enumerate(items):
    print('Company: %s\t\tPosition: %s' % (item.get('data-company'), item.get('data-positionname')))
    print('Experience/Education: %s\t\tSalary: %s' % (asks[index], item.get('data-salary')))
    print('##' * 50)
I read a few blog posts on beating the anti-crawling measures, and they also threw in data cleaning and storage.
The part actually worth borrowing is the dozen or so anti-crawling lines, yet those posts run to a few hundred lines of code. If the reader is on Elasticsearch and the post writes to MySQL/CSV, isn't that a waste of the reader's time? If it were me, I'd just close the tab.
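Since the proxy setup is the part worth borrowing, here is a minimal sketch of isolating it in a small helper so a blocked proxy can be swapped out quickly. PROXY_POOL, make_driver and the addresses in the list are hypothetical placeholders, not part of the original post; fill in free proxies you have verified yourself.

import random

from selenium import webdriver

# Hypothetical pool of free proxies -- replace with addresses you have checked yourself.
PROXY_POOL = [
    'http://10.0.0.1:8080',
    'http://10.0.0.2:3128',
]

def make_driver(driver_path):
    """Launch Chrome behind a proxy picked at random from PROXY_POOL."""
    chromeOptions = webdriver.ChromeOptions()
    chromeOptions.add_argument('--proxy-server=%s' % random.choice(PROXY_POOL))
    return webdriver.Chrome(executable_path=driver_path, chrome_options=chromeOptions)

# Usage: if requests start getting blocked, quit the driver and call make_driver()
# again to come back on a different IP.
driver = make_driver(r'D:\chromedriver\chromedriver.exe')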