
Scraping Web of Science with Python


Given an author's name and a publication year range, the script crawls Web of Science for the articles that author published in the range, collecting each article's title and the impact factor of the journal it appeared in.
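At a high level, the script below runs in three stages: it grabs a session SID from the Web of Science search page with Selenium, submits the general-search form (author field, custom year range) via a requests POST, and then steps through the result set one full_record.do?...&doc=N page at a time, parsing each record with BeautifulSoup and appending matching rows to 1.csv.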

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import csv
import re
import random
import requests
import os
from bs4 import BeautifulSoup
# Global counters used for error handling and retries.
c = 0   # consecutive-failure counter for the current record
d = 0   # number of records with no JCR quartile found
e = 0   # consecutive parse failures (used to decide when to skip a record)

chrome_options = Options()
chrome_options.add_argument('--no-sandbox')
# chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-dev-shm-usage')

# Seed full-record URL; the SID and doc index in it are rewritten at runtime.
url1 = 'http://apps.webofknowledge.com/full_record.do?product=WOS&search_mode=GeneralSearch&qid=4&SID=5ArzJjzffBtmmVcFhzj&page=1&doc=1&cacheurlFromRightClick=no'
url2 = url1

tx = 'Xie, X'            # name matched against the reprint/corresponding-author field
zz = 'Xie, X'            # author name used as the search term (AU field)
zs = 18282               # total number of records in the result set (stop condition)
xm = '谢欣'              # author's name, written as the first CSV column
dz = 'Chinese Acad Sci'  # affiliation string matched against the address table
i = 13803                # doc index to resume crawling from
class SpiderMain(object):
    def __init__(self, sid, kanming):
        self.headers = {
            'Origin': 'https://apps.webofknowledge.com',
            'Referer': 'https://apps.webofknowledge.com/UA_GeneralSearch_input.do?product=UA&search_mode=GeneralSearch&SID=R1ZsJrXOFAcTqsL6uqh&preferencesSaved=',
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36",
            'Content-Type': 'application/x-www-form-urlencoded'
        }
        # General-search form: author (AU) query restricted to a custom year range.
        self.form_data = {
            'fieldCount': 1,
            'action': 'search',
            'product': 'WOS',
            'search_mode': 'GeneralSearch',
            'SID': sid,
            'max_field_count': 25,
            'formUpdated': 'true',
            'value(input1)': kanming,
            'value(select1)': 'AU',
            'value(hidInput1)': '',
            'limitStatus': 'collapsed',
            'ss_lemmatization': 'On',
            'ss_spellchecking': 'Suggest',
            'SinceLastVisit_UTC': '',
            'SinceLastVisit_DATE': '',
            'range': 'CUSTOM',
            'period': 'Year Range',
            'startYear': '2012',
            'endYear': '2021',
            'update_back2search_link_param': 'yes',
            'ssStatus': 'display:none',
            'ss_showsuggestions': 'ON',
            'ss_query_language': 'auto',
            'ss_numDefaultGeneralSearchFields': 1,
            'rs_sort_by': 'PY.D;LD.D;SO.A;VL.D;PG.A;AU.A'
        }
        # Form used by delete_history() to clear the search history for this SID.
        self.form_data2 = {
            'product': 'WOS',
            'prev_search_mode': 'CombineSearches',
            'search_mode': 'CombineSearches',
            'SID': sid,
            'action': 'remove',
            'goToPageLoc': 'SearchHistoryTableBanner',
            'currUrl': 'https://apps.webofknowledge.com/WOS_CombineSearches_input.do?SID=' + sid + '&product=WOS&search_mode=CombineSearches',
            'x': 48,
            'y': 9,
            'dSet': 1
        }

    def craw(self, root_url, i):
        """POST the search form and return the URL of the first full-record page,
        or None if the request or parsing fails."""
        try:
            s = requests.Session()
            r = s.post(root_url, data=self.form_data, headers=self.headers)
            r.encoding = r.apparent_encoding
            # The first result link sits inside a <span class="smallV110"> block.
            re_text1 = r'<span class="smallV110">[\s\S]*?value>'
            match_list = re.findall(re_text1, r.text)
            soup = BeautifulSoup(match_list[0], 'html.parser')
            prefix = "http://apps.webofknowledge.com"
            return prefix + soup.a['href']
        except Exception:
            pass

    def delete_history(self):
        murl = 'https://apps.webofknowledge.com/WOS_CombineSearches.do'
        s = requests.Session()
        s.post(murl, data=self.form_data2, headers=self.headers)
root_url = 'https://apps.webofknowledge.com/UA_GeneralSearch.do'
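For orientation, here is a minimal sketch of how SpiderMain is driven; it mirrors the SID-recovery code inside Html_data further down and assumes a local chromedriver at the same path used there:

browser = webdriver.Chrome('/usr/lib/chromium-browser/chromedriver', options=chrome_options)
browser.get('https://www.webofscience.com/wos/alldb/basic-search')
soup = BeautifulSoup(browser.page_source, 'lxml')
sid = re.search('"sid":"(.*)"};', str(soup))[1]   # session ID embedded in the page
browser.quit()

spider = SpiderMain(sid, zz)                      # zz is the author search term
first_record_url = spider.craw(root_url, 0)
print(first_record_url)   # e.g. http://apps.webofknowledge.com/full_record.do?...&doc=1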
class Html_data:
    def __init__(self, soup):
        global c, d, e, i, url1
        self.title = ''
        self.author = ''
        self.abstract = ''
        self.keywords = ''
        self.author_data = ''
        self.data = ''
        self.JCR_quartile = ''
        self.Impact_Factor_table = ''
        self.FR_field = ''
        self.year = ''
        self.soup = soup
        self.flag = 0    # 1 if tx appears among the reprint/corresponding-author fields
        self.flag1 = 0   # 1 if dz appears in the last address table
        try:
            self.title = soup.find(attrs={'class': 'title'}).text.replace('\n', '')

            # JCR quartile cell, if the journal-information block is present.
            soup1 = soup.find_all('td', class_="JCR_quartile")
            r = None
            if len(soup1) > 0:
                r = re.search('>(.*)<', str(soup1[0]))
            if r == None:
                d = d + 1
                c = 1
            else:
                c = 0
                print(r.group(1))
                self.JCR_quartile = r.group(1)

            # Impact factor from the journal impact table.
            r = re.search('<td> (.*) </td>', str(soup.find_all('table', class_="Impact_Factor_table")))
            print(r)
            if r == None:
                c = 1
            else:
                c = 0
                self.Impact_Factor_table = r.group(1)

            # Publication year: the line after 'Published:' in the source block.
            try:
                self.data = soup.find(attrs={'class': 'block-record-info block-record-info-source'}).text
                data1 = self.data.split('\n')
                data2 = data1[data1.index('Published:') + 1]
                self.year = data2[-4:]
            except:
                pass

            items = soup.find_all(attrs={'class': 'block-record-info'})
            for item in items:
                if len(item.attrs['class']) > 1:
                    continue
                if 'By:' in item.text:
                    item1 = item.find_all('p', class_="FR_field")
                    self.author = item1[0].text.replace('By:', '').replace('\n', '').replace(' ', '').replace(' ]', ']')
                    continue
                elif 'Abstract' in item.text:
                    self.abstract = item.text
                    continue
                elif 'Keywords' in item.text:
                    self.keywords = item.text
                    continue
                elif 'Author Information' in item.text:
                    item2 = item.find_all('table', class_="FR_table_noborders")
                    if tx in str(item.find_all('p', class_="FR_field")):
                        self.flag = 1
                    try:
                        if dz in item2[len(item2) - 1].text:
                            self.flag1 = 1
                    except:
                        self.author_data = item.text
                    continue
            e = 0
        except:
            # Parsing failed, most likely because the session SID expired.
            # Fetch a fresh SID with Selenium, redo the search, and step the
            # doc index back so the same record is retried (at most twice).
            browser = webdriver.Chrome('/usr/lib/chromium-browser/chromedriver', options=chrome_options)
            browser.get('https://www.webofscience.com/wos/alldb/basic-search')
            soup = BeautifulSoup(browser.page_source, 'lxml')
            r = re.search('"sid":"(.*)"};', str(soup))
            print(r[1])
            browser.quit()
            obj_spider = SpiderMain(r[1], zz)
            url1 = obj_spider.craw(root_url, 0)
            i = i - 1
            e = e + 1
            if e >= 2:
                i = i + 1
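To parse a single record in isolation (the main loop below does the same thing), Html_data can be constructed straight from a fetched page; first_record_url here is the link returned by the sketch above:

respon = requests.get(first_record_url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=60)
record = Html_data(BeautifulSoup(respon.text, 'lxml'))
print(record.title, record.year, record.Impact_Factor_table, record.JCR_quartile)
print(record.flag, record.flag1)  # 1 when tx / dz matched the corresponding-author and address blocks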
# Pool of User-Agent strings, one per line, read as raw bytes.
f = open('User-Agent.txt', "rb")
user_agents = f.readlines()
f.close()
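The slicing user_agent[2:-5] used below strips the b'...' wrapper that str() adds around a bytes line, plus the escaped \r\n, so User-Agent.txt is assumed to hold one raw UA string per line with Windows (CRLF) line endings, for example:

Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36
Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36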
# Start fresh only when beginning from the first record.
if i == 1:
    os.system('rm -r 1.csv')
    os.system('touch 1.csv')
while True:
    url = url1
    try:
        url = url.replace('doc=1', 'doc=' + str(i))
    except:
        # url1 can be None if craw() failed; fall back to the seed URL.
        url = url2
        url = url.replace('doc=1', 'doc=' + str(i))
    print(url)

    respon = None
    c = 0
    while respon == None:
        # Rotate through the User-Agent pool until a request succeeds.
        user_agent1 = random.choice(user_agents)
        user_agent = str(user_agent1)
        print(user_agent[2:-5])
        headers = {'User-Agent': user_agent[2:-5]}
        try:
            respon = requests.get(url, headers=headers, timeout=60)
        except:
            user_agents.remove(user_agent1)
            print(len(user_agents))
            c = c + 1
            if c == 3:
                # Three consecutive failures: skip to the next record.
                url = url1
                i = i + 1
                url = url.replace('doc=1', 'doc=' + str(i))
                print(url)
                c = 0

    if respon:
        html = respon.text
        soup = BeautifulSoup(html, 'lxml')
        html_data = Html_data(soup)
        # Pull the parsed fields off the record object.
        title = html_data.title
        authors = html_data.author
        abstract = html_data.abstract
        authors_data = html_data.author_data
        data = html_data.data
        keywords = html_data.keywords
        year = html_data.year

        authors1 = authors.split(';')
        b = 0
        for a in authors1:
            b = b + 1
            if zz in a:
                if html_data.flag1 == 1:
                    if html_data.flag == 1:
                        cengci = '通讯'     # corresponding author
                    else:
                        cengci = str(b)    # position in the author list
                    csv_data = [xm, title, year, cengci, html_data.Impact_Factor_table, html_data.JCR_quartile]
                    c = 0
                    f = open('1.csv', encoding='gbk', mode='a', newline='')
                    csv_writer = csv.writer(f)
                    csv_writer.writerow(csv_data)
                    f.close()
                    print(csv_data)
                break

    if i == zs:
        break
    i = i + 1
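Each row of 1.csv is [name, title, year, author position or 通讯 (corresponding), impact factor, JCR quartile]. A quick way to inspect the output, assuming the same gbk encoding used when writing:

with open('1.csv', encoding='gbk', newline='') as f:
    for name, title, year, level, impact_factor, quartile in csv.reader(f):
        print(year, quartile, title)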
    'User-Agent':user_agent[2:-5]}try:respon = requests.get(url, headers =headers,timeout=60)except:user_agents.remove(user_agent1)print(len(user_agents))c=c+1if c==3:url=url1i=i+1url=url.replace('doc=1','doc='+str(i))print(url)c=0#print(respon)#print(22222222222)if respon:html = respon.textsoup = BeautifulSoup(html,'lxml')html_data = Html_data(soup)#print(soup) # 获取对象信息title = html_data.title authors = html_data.authorabstract = html_data.abstractauthors_data = html_data.author_datadata = html_data.datakeywords = html_data.keywords   year=html_data.year authors1=authors.split(';')b=0for a in authors1:b=b+1#if 'Su, Yan Qing' in a or 'Su, Yanqing' in a or 'Su Yanqing' in a or 'Su, Yan-Qing' in a:#############################if zz in a :if html_data.flag1==1:if html_data.flag==1:cengci='通讯'else:cengci=str(b)csv_data =[xm,title, year,cengci,html_data.Impact_Factor_table,html_data.JCR_quartile]##########################c=0f=open('1.csv', encoding='gbk', mode='a', newline='')csv_writer = csv.writer(f)csv_writer.writerow(csv_data)f.close()print(csv_data)breakif i==zs:breaki=i+1