jsoup 的 jar 包下载地址:http://jsoup.org/download
<%@ page import="little.util.Utils" %>
<%@ page import="org.jsoup.nodes.Element" %>
<%@ page import="org.jsoup.Jsoup" %>
<%@ page import="org.jsoup.nodes.Document" %>
<%@ page import="org.jsoup.select.Elements" %>
<%@ page import="java.util.Map" %>
<%@ page import="java.util.TreeMap" %>
<%@ page import="java.util.Iterator" %>
<%@ page contentType="text/html;charset=GBK" language="java" %>
<%
    // Purpose: fetch an article index page, collect the links to its content
    // pages, download each page in ascending page-number order, and print the
    // concatenated article bodies into the response.

    // Index page that links to every content page of the article.
    String url = "http://www.yi-see.com/art_10630_3518.html";
    // Original note: "access through a proxy" -- presumably what the second
    // argument enables; confirm against Utils.readHtml4Get.
    String html = Utils.readHtml4Get(url, true);
    Document doc = Jsoup.parse(html);

    // Anchors whose href contains "read_" are the links to the content pages.
    Elements pageLinks = doc.select("a[href*=read_]");

    // TreeMap keeps the entries sorted by page number. Links whose text yields
    // the same number overwrite each other, because put() replaces the value.
    Map<Integer, String> pages = new TreeMap<Integer, String>();
    long startTime = System.currentTimeMillis();
    for (Element link : pageLinks) {
        pages.put(Utils.regexNum(link.text()), link.attr("href"));
    }

    StringBuilder buffer = new StringBuilder();
    for (Map.Entry<Integer, String> entry : pages.entrySet()) {
        // Page number, visited in ascending order.
        Integer key = entry.getKey();
        // Relative link of the content page.
        String value = entry.getValue();
        System.out.println("key="+key+">>>value="+value);

        // Build the absolute link and fetch the content page.
        String pageHtml = Utils.readHtml4Get("http://www.yi-see.com/" + value, true);
        Document pageDoc = Jsoup.parse(pageHtml);

        // <td class="ART"> holds the article text; only the first match is used.
        Elements articleCells = pageDoc.select("td[class=ART]");
        // Computed fresh for every page: when a page has no article cell the
        // body is empty instead of repeating the previous page's content.
        String str = articleCells.isEmpty() ? "" : articleCells.get(0).toString();

        // Strip the site's footer block from the end of the article markup.
        str = str.replace(" <div class=\"FL\">\n" + " 请按 Ctrl+D 将本页加入书签\n" + " </div><a href=\"feedback.php\" target=\"_blank\">提意见或您需要哪些图书的全集整理?</a><a><br /></a><a href=\"feedback.php?GJB=举报\" target=\"_blank\">举报</a><a><span class=\"mr\">【网站提示】 读者如发现作品内容与法律抵触之处,请向本站举报。 非常感谢您对易读的支持!</span><br /> </a>","");

        buffer.append("++++++++++++++++ <br />");
        buffer.append(" 第"+key+"页: <br />");
        buffer.append("++++++++++++++++ <br />");
        buffer.append(str);
    }
    out.print(buffer.toString());

    // Divide by a double so fractional minutes survive; dividing in long
    // arithmetic first would truncate the result to whole minutes.
    System.out.println("共用时:" + ((System.currentTimeMillis() - startTime) / 60000.0) + " 分钟");
%>
Utils 工具类中的 regexNum 方法:
//\\u4E00-\\u9FA5\\uF900-\\uFA2D是指汉字的Unicode编码范围 private static final Pattern REGEX_NUM = Pattern.compile("(^[\\u4E00-\\u9FA5\\uF900-\\uFA2D])(\\d+)([\\u4E00-\\u9FA5\\uF900-\\uFA2D])?"); public static int regexNum(String str){ Matcher matcher = REGEX_NUM.matcher(str); if(matcher.matches()){ return Integer.parseInt(matcher.group(2)); } return 0; }