当前位置: 代码迷 >> HTML/CSS >> 透过htmlParser抓取百度的相关内容
  详细解决方案

透过htmlParser抓取百度的相关内容

热度:33138   发布时间:2012-11-01 11:11:33.0
通过htmlParser抓取百度的相关内容

最近这两天我做了个最新电影网的视频网站,主要内容是从土豆抓取来的,所以内容页只是一个框架,不便于搜索引擎抓取。于是我就想加些相关内容,像这样的内容。
我是通过百度搜索关键字来填充这些内容的,下面就是我通过 htmlParser 抓取的代码。

public?class?BaiduResultAction?extends?BaseAction?{

????
public?static?final?Logger?logger?=?Logger
????????????.getLogger(BaiduResultAction.
class);

????
/**?*//**
?????*?组装新闻
?????*?
?????*?
@param?url
?????*?
@return
?????
*/

????
public?String?compNews(String?url)?{
????????String?returnContent?
=?null;
????????
try?{
????????????ParserModel?parserModel?
=?new?ParserModel();
????????????
//table?的抓取标签
????????????String?content?=?"border=\"0\"?cellpadding=\"0\"?cellspacing=\"0\"";
????????????parserModel.setContent(content);
????????????NodeClassNameFilter?contentNodeClassNameFilter?
=?new?NodeClassNameFilter(
????????????????????TableTag.
class,?parserModel);

????????????NodeList?contentList?
=?getAllNodeList(url,
????????????????????contentNodeClassNameFilter);

????????????
//?对table的处理?只取第一个table中的一项记录
????????????
//如果全部抓取内容,则要去掉最后一个break;
????????????for?(int?i?=?1;?i?<?contentList.size();?i++)?{
????????????????
if?(contentList.elementAt(i)?instanceof?TableTag)?{
????????????????????TableTag?tableContent?
=?(TableTag)?contentList
????????????????????????????.elementAt(i);
????????????????????
int?rowCount?=?tableContent.getRowCount();
????????????????????TableRow[]?arrRows?
=?tableContent.getRows();
????????????????????
for?(int?j?=?0;?j?<?arrRows.length;?j++)?{
????????????????????????TableRow?tableRow?
=?arrRows[j];
????????????????????????TableColumn[]?arrColumm?
=?tableRow.getColumns();
????????????????????????
for?(int?k?=?0;?k?<?arrColumm.length;?k++)?{
????????????????????????????String?columContent?
=?arrColumm[k].toHtml();
????????????????????????????
if(columContent?!=?null){
????????????????????????????????String[]?split?
=?columContent.split("<br>");
????????????????????????????????
if(split.length>2)
????????????????????????????????????returnContent?
=?split[1].substring(0,split[1].length()-4);
????????????????????????????}

????????????????????????????
break;
????????????????????????}

????????????????????}

????????????????}

????????????????
break;
????????????}
????????
????????}
?catch?(IllegalArgumentException?e)?{
????????????
//?TODO?Auto-generated?catch?block
????????????e.printStackTrace();
????????????
return?null;
????????}
?catch?(Exception?e)?{
????????????
//?TODO?Auto-generated?catch?block
????????????e.printStackTrace();
????????}

????????
return?returnContent;
????}


????
public?static?void?main(String[]?args)?throws?Exception?{
????????BaiduResultAction?action?
=?new?BaiduResultAction();
????????
//抓取sohu的内容.通过百度
????????String?url?=?"http://www.baidu.com/s?wd=%BA%DA%BF%CD%B5%DB%B9%FAII+11%28112%29++site%3Asohu.com";
????????
//?String?url?=?"http://bbs.hoopchina.com/htm_data/96/0712/274754.html";
????????
//?List<String>?hrefList?=?sinaAction.parseLink(url,?getIndexFilter());
????????/**//*?logger.debug(sinaAction.compNews(url));?*/
????????action.compNews(url);
????}

}

效果如下:http://www.tondou.cn/c/2008-05-12/314146