package com.safetys.crawler.util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.TableTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

import com.safetys.framework.exception.ApplicationAccessException;

/**
 * Aggregates search results from Baidu and Google.
 * @author zhaozhi3758
 * date: 2011-04-19
 */
public class Crawler {
	private final static String splitStr = "zzc@cheng";
	private String encoding = "gbk"; // charset used to parse result pages
	public String searchMode;  // "keyword" to search by keyword, "specifyUrl" to crawl a given URL
	public String baiduUrl;    // Baidu search URL template; must contain ${keyword} and ${searchNum}, e.g. "http://www.baidu.com/s?rn=${searchNum}&wd=${keyword}"
	public String googleUrl;   // Google search URL template; must contain ${keyword} and ${searchNum}, e.g. "http://www.google.com.hk/search?hl=zh-CN&source=hp&q=${keyword}&num=${searchNum}&aq=f&aqi=&aql=&oq=&gs_rfai="
	public String keyword;     // search keyword
	public int searchNum = 0;  // number of results to fetch
	public String specifyUrl;  // URL to crawl when searchMode is "specifyUrl"

	/**
	 * Crawls the Baidu search result page.
	 */
	public List<String> crawlerBaidu() {
		Parser myParser = new Parser();
		try {
			myParser.setURL(getBaiduUrl());
			myParser.setEncoding(encoding); // parse with the configured charset
		} catch (ParserException e1) {
			e1.printStackTrace();
		}
		NodeList nodeList = null;
		NodeFilter tableFilter = new NodeClassFilter(TableTag.class);
		OrFilter lastFilter = new OrFilter();
		lastFilter.setPredicates(new NodeFilter[] { tableFilter });
		List<String> result = new ArrayList<String>();
		try {
			nodeList = myParser.parse(lastFilter);
			for (int i = 0; i < nodeList.size(); i++) {
				if (nodeList.elementAt(i) instanceof TableTag) {
					TableTag tag = (TableTag) nodeList.elementAt(i);
					// Baidu wraps each result in a table carrying an id attribute
					if (tag.getAttribute("id") != null) {
						result.addAll(getBaiduLink(tag.getChildrenHTML()));
					}
				}
			}
		} catch (ParserException e) {
			e.printStackTrace();
		}
		return result;
	}

	private List<String> getBaiduLink(String s) {
		NodeList nodeList = null;
		Parser myParser = Parser.createParser(s, encoding); // parse the fragment with the configured charset
		List<String> result = new ArrayList<String>();
		try {
			// collect all link nodes with a NodeClassFilter
			nodeList = myParser.parse(new NodeClassFilter(LinkTag.class));
			if (nodeList != null && nodeList.size() > 0) {
				// walk every URL node
				for (int l = 0; l < nodeList.size(); l++) {
					String urlLink = ((LinkTag) nodeList.elementAt(l)).extractLink();
					String linkName = ((LinkTag) nodeList.elementAt(l)).getLinkText();
					// drop the "百度快照" (cached copy) link and Baidu-internal links; keep absolute http links only
					if (!linkName.equals("百度快照") && urlLink.indexOf("baidu") == -1 && urlLink.indexOf("http") == 0) {
						System.out.println("baidu--->" + linkName + splitStr + urlLink);
						result.add(linkName + splitStr + urlLink);
					}
				}
			}
		} catch (ParserException e) {
			e.printStackTrace();
		}
		return result;
	}

	/**
	 * Crawls the links in the relevant part of the Google result page.
	 */
	private List<String> crawlerGoogle() {
		String htmlstr = getUrlHtmlByHttpClient(getGoogleUrl());
		List<String> result = new ArrayList<String>();
		try {
			Parser parser = Parser.createParser(htmlstr, encoding);
			// select every A tag with a TagNameFilter
			TagNameFilter filter = new TagNameFilter("A");
			NodeList nodes = parser.extractAllNodesThatMatch(filter);
			if (nodes != null) {
				for (int i = 0; i < nodes.size(); i++) {
					LinkTag tag = (LinkTag) nodes.elementAt(i);
					// skip Google-internal links; keep absolute http links only
					if (tag.getLink().indexOf("google") == -1 && tag.getLink().indexOf("http") == 0) {
						System.out.println("google--->" + tag.getLinkText() + splitStr + tag.getLink());
						result.add(tag.getLinkText() + splitStr + tag.getLink());
					}
				}
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
		return result;
	}

	/**
	 * Fetches a search result page by simulating a client request.
	 * @param url
	 * @return the page HTML, or null on failure
	 */
	private String getUrlHtmlByHttpClient(String url) {
		String searchHtml = null;
		HttpClient httpClient = new HttpClient();
		httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(5000);
		GetMethod getMethod = new GetMethod(url);
		getMethod.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, 5000);
		getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER, new DefaultHttpMethodRetryHandler());
		try {
			int statusCode = httpClient.executeMethod(getMethod);
			if (statusCode != HttpStatus.SC_OK) {
				System.err.println("Method failed: " + getMethod.getStatusLine());
			}
			InputStream bodyIs = getMethod.getResponseBodyAsStream();
			// if Chinese text comes out garbled, change the charset here
			BufferedReader br = new BufferedReader(new InputStreamReader(bodyIs, encoding));
			StringBuffer sb = new StringBuffer();
			String line = null;
			while ((line = br.readLine()) != null) {
				sb.append(line);
			}
			searchHtml = sb.toString();
			return searchHtml;
		} catch (HttpException e) {
			System.out.println("Please check your http address!");
			e.printStackTrace();
			return null;
		} catch (IOException e) {
			e.printStackTrace();
			return null;
		} finally {
			getMethod.releaseConnection();
		}
	}

	/**
	 * Unified entry point for keyword crawling.
	 */
	public List<String> crawler() throws ApplicationAccessException {
		if (null == searchMode || searchMode.equals(""))
			throw new ApplicationAccessException("searchMode is null");
		Set<String> set = new HashSet<String>();
		List<String> list = new ArrayList<String>();
		if (searchMode.equals("specifyUrl")) {
			// crawl the specified URL -- left empty in the original; see the sketch after the class
		} else if (searchMode.equals("keyword")) {
			// search by keyword on both engines; the Set de-duplicates the merged results
			set.addAll(crawlerBaidu());
			set.addAll(crawlerGoogle());
		}
		list.addAll(set);
		return list;
	}

	public String getBaiduUrl() {
		// each engine receives half of the requested result count
		return baiduUrl.replace("${keyword}", getKeyword()).replace("${searchNum}", "" + (searchNum / 2));
	}

	public void setBaiduUrl(String baiduUrl) {
		this.baiduUrl = baiduUrl;
	}

	public String getEncoding() {
		return encoding;
	}

	public void setEncoding(String encoding) {
		this.encoding = encoding;
	}

	public String getGoogleUrl() {
		return googleUrl.replace("${keyword}", getKeyword()).replace("${searchNum}", "" + (searchNum / 2));
	}

	public void setGoogleUrl(String googleUrl) {
		this.googleUrl = googleUrl;
	}

	public String getKeyword() {
		String key = "";
		try {
			// URL-encode the keyword with the configured charset
			key = URLEncoder.encode(keyword, encoding);
		} catch (UnsupportedEncodingException e) {
			e.printStackTrace();
		}
		return key;
	}

	public void setKeyword(String keyword) {
		this.keyword = keyword;
	}

	public String getSearchMode() {
		return searchMode;
	}

	public void setSearchMode(String searchMode) {
		this.searchMode = searchMode;
	}

	public int getSearchNum() {
		return searchNum;
	}

	public void setSearchNum(int searchNum) {
		this.searchNum = searchNum;
	}

	public String getSpecifyUrl() {
		return specifyUrl;
	}

	public void setSpecifyUrl(String specifyUrl) {
		this.specifyUrl = specifyUrl;
	}
	public static void main(String[] args) throws ApplicationAccessException {
		Crawler cl = new Crawler();
		cl.setEncoding("gbk");
		cl.setSearchNum(10);
		cl.setKeyword("面包");
		cl.setSearchMode("keyword");
		cl.setBaiduUrl("http://www.baidu.com/s?rn=${searchNum}&wd=${keyword}");
		cl.setGoogleUrl("http://www.google.com.hk/search?hl=zh-CN&source=hp&q=${keyword}&num=${searchNum}&aq=f&aqi=&aql=&oq=&gs_rfai=");
		System.out.println("=====>" + cl.getBaiduUrl());
		System.out.println("=====>" + cl.getGoogleUrl());
		System.out.println(cl.crawler());
	}
}
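The specifyUrl branch of crawler() is left empty in the original. Below is a minimal sketch of one way to fill it in, reusing the class's own getUrlHtmlByHttpClient helper and HtmlParser's NodeClassFilter. The method name crawlerSpecifyUrl and the keep-absolute-http-links rule are assumptions of this sketch, not part of the original code; the class builds against Commons HttpClient 3.x and HtmlParser 1.6.

	// Sketch only: this method is not in the original class.
	private List<String> crawlerSpecifyUrl() {
		String htmlstr = getUrlHtmlByHttpClient(getSpecifyUrl());
		List<String> result = new ArrayList<String>();
		if (htmlstr == null) {
			return result; // fetch failed, nothing to parse
		}
		try {
			Parser parser = Parser.createParser(htmlstr, encoding);
			// collect every link node on the page
			NodeList nodes = parser.extractAllNodesThatMatch(new NodeClassFilter(LinkTag.class));
			for (int i = 0; nodes != null && i < nodes.size(); i++) {
				LinkTag tag = (LinkTag) nodes.elementAt(i);
				// keep absolute http links, mirroring the filtering in the keyword crawlers
				if (tag.getLink().indexOf("http") == 0) {
					result.add(tag.getLinkText() + splitStr + tag.getLink());
				}
			}
		} catch (ParserException e) {
			e.printStackTrace();
		}
		return result;
	}

With this in place, the specifyUrl branch of crawler() would simply call set.addAll(crawlerSpecifyUrl()).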