最近这两天我做了一个最新电影网的视频网站,内容主要是从土豆网抓取来的,所以内容页只是一个框架,不便于搜索引擎抓取。因此我想给页面加一些相关内容,像这样的内容。
于是我就通过百度搜索关键字来填充内容。下面就是我用 htmlParser 抓取的代码。
public?class?BaiduResultAction?extends?BaseAction?{
????public?static?final?Logger?logger?=?Logger
????????????.getLogger(BaiduResultAction.class);
????/**
?????*?组装新闻
?????*?
?????*?@param?url
?????*?@return
?????*/
????public?String?compNews(String?url)?{
????????String?returnContent?=?null;
????????try?{
????????????ParserModel?parserModel?=?new?ParserModel();
????????????//table?的抓取标签
????????????String?content?=?"border=\"0\"?cellpadding=\"0\"?cellspacing=\"0\"";
????????????parserModel.setContent(content);
????????????NodeClassNameFilter?contentNodeClassNameFilter?=?new?NodeClassNameFilter(
????????????????????TableTag.class,?parserModel);
????????????NodeList?contentList?=?getAllNodeList(url,
????????????????????contentNodeClassNameFilter);
????????????//?对table的处理?只取第一个table中的一项记录
????????????//如果全部抓取内容,则要去掉最后一个break;
????????????for?(int?i?=?1;?i?<?contentList.size();?i++)?{
????????????????if?(contentList.elementAt(i)?instanceof?TableTag)?{
????????????????????TableTag?tableContent?=?(TableTag)?contentList
????????????????????????????.elementAt(i);
????????????????????int?rowCount?=?tableContent.getRowCount();
????????????????????TableRow[]?arrRows?=?tableContent.getRows();
????????????????????for?(int?j?=?0;?j?<?arrRows.length;?j++)?{
????????????????????????TableRow?tableRow?=?arrRows[j];
????????????????????????TableColumn[]?arrColumm?=?tableRow.getColumns();
????????????????????????for?(int?k?=?0;?k?<?arrColumm.length;?k++)?{
????????????????????????????String?columContent?=?arrColumm[k].toHtml();
????????????????????????????if(columContent?!=?null){
????????????????????????????????String[]?split?=?columContent.split("<br>");
????????????????????????????????if(split.length>2)
????????????????????????????????????returnContent?=?split[1].substring(0,split[1].length()-4);
????????????????????????????}
????????????????????????????break;
????????????????????????}
????????????????????}
????????????????}
????????????????break;
????????????}????????
????????}?catch?(IllegalArgumentException?e)?{
????????????//?TODO?Auto-generated?catch?block
????????????e.printStackTrace();
????????????return?null;
????????}?catch?(Exception?e)?{
????????????//?TODO?Auto-generated?catch?block
????????????e.printStackTrace();
????????}
????????return?returnContent;
????}
????public?static?void?main(String[]?args)?throws?Exception?{
????????BaiduResultAction?action?=?new?BaiduResultAction();
????????//抓取sohu的内容.通过百度
????????String?url?=?"http://www.baidu.com/s?wd=%BA%DA%BF%CD%B5%DB%B9%FAII+11%28112%29++site%3Asohu.com";
????????//?String?url?=?"http://bbs.hoopchina.com/htm_data/96/0712/274754.html";
????????//?List<String>?hrefList?=?sinaAction.parseLink(url,?getIndexFilter());
????????/*?logger.debug(sinaAction.compNews(url));?*/
????????action.compNews(url);
????}
}
????public?static?final?Logger?logger?=?Logger
????????????.getLogger(BaiduResultAction.class);
????/**
?????*?组装新闻
?????*?
?????*?@param?url
?????*?@return
?????*/
????public?String?compNews(String?url)?{
????????String?returnContent?=?null;
????????try?{
????????????ParserModel?parserModel?=?new?ParserModel();
????????????//table?的抓取标签
????????????String?content?=?"border=\"0\"?cellpadding=\"0\"?cellspacing=\"0\"";
????????????parserModel.setContent(content);
????????????NodeClassNameFilter?contentNodeClassNameFilter?=?new?NodeClassNameFilter(
????????????????????TableTag.class,?parserModel);
????????????NodeList?contentList?=?getAllNodeList(url,
????????????????????contentNodeClassNameFilter);
????????????//?对table的处理?只取第一个table中的一项记录
????????????//如果全部抓取内容,则要去掉最后一个break;
????????????for?(int?i?=?1;?i?<?contentList.size();?i++)?{
????????????????if?(contentList.elementAt(i)?instanceof?TableTag)?{
????????????????????TableTag?tableContent?=?(TableTag)?contentList
????????????????????????????.elementAt(i);
????????????????????int?rowCount?=?tableContent.getRowCount();
????????????????????TableRow[]?arrRows?=?tableContent.getRows();
????????????????????for?(int?j?=?0;?j?<?arrRows.length;?j++)?{
????????????????????????TableRow?tableRow?=?arrRows[j];
????????????????????????TableColumn[]?arrColumm?=?tableRow.getColumns();
????????????????????????for?(int?k?=?0;?k?<?arrColumm.length;?k++)?{
????????????????????????????String?columContent?=?arrColumm[k].toHtml();
????????????????????????????if(columContent?!=?null){
????????????????????????????????String[]?split?=?columContent.split("<br>");
????????????????????????????????if(split.length>2)
????????????????????????????????????returnContent?=?split[1].substring(0,split[1].length()-4);
????????????????????????????}
????????????????????????????break;
????????????????????????}
????????????????????}
????????????????}
????????????????break;
????????????}????????
????????}?catch?(IllegalArgumentException?e)?{
????????????//?TODO?Auto-generated?catch?block
????????????e.printStackTrace();
????????????return?null;
????????}?catch?(Exception?e)?{
????????????//?TODO?Auto-generated?catch?block
????????????e.printStackTrace();
????????}
????????return?returnContent;
????}
????public?static?void?main(String[]?args)?throws?Exception?{
????????BaiduResultAction?action?=?new?BaiduResultAction();
????????//抓取sohu的内容.通过百度
????????String?url?=?"http://www.baidu.com/s?wd=%BA%DA%BF%CD%B5%DB%B9%FAII+11%28112%29++site%3Asohu.com";
????????//?String?url?=?"http://bbs.hoopchina.com/htm_data/96/0712/274754.html";
????????//?List<String>?hrefList?=?sinaAction.parseLink(url,?getIndexFilter());
????????/*?logger.debug(sinaAction.compNews(url));?*/
????????action.compNews(url);
????}
}
效果如下:http://www.tondou.cn/c/2008-05-12/314146