jsoup 是一款 Java 的 HTML 解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM、CSS以及类似于jQuery的操作方法来取出和操作数据。请参考: http://jsoup.org/
?
?
    jsoup的主要功能如下:

      从一个URL,文件或字符串中解析HTML;
      使用DOM或CSS选择器来查找、取出数据;
      可操作HTML元素、属性、文本;
      jsoup是基于MIT协议发布的,可放心使用于商业项目。
    下载和安装:
      maven安装方法:
      把下面放入pom.xml下
        <dependency>
          <!-- jsoup HTML parser library @ http://jsoup.org/ -->
          <groupId>org.jsoup</groupId>
          <artifactId>jsoup</artifactId>
          <version>1.5.2</version>
        </dependency>
      用jsoup解析html的方法如下:
        解析url html方法
?
Document
doc
=
Jsoup
.
connect
(
"http://example.com"
)
?
.
data
(
"query"
,
"Java"
)
?
.
userAgent
(
"Mozilla"
)
?
.
cookie
(
"auth"
,
"token"
)
?
.
timeout
(
3000
)
?
.
post
();
?
?? ? ?从文件中解析的方法:
?
?
// Parse HTML from a local file; the third argument is the base URI passed to the parser.
File htmlFile = new File("/tmp/input.html");
Document doc = Jsoup.parse(htmlFile, "UTF-8", "http://example.com/");
?
?
  类似js, jsoup提供下面方法:
?
- getElementById(String id)  用id获得元素
- getElementsByTag(String tag)  用标签获得元素
- getElementsByClass(String className)  用class获得元素
- getElementsByAttribute(String key)  用属性获得元素
?
同时还提供下面的方法获取兄弟节点:
siblingElements(), firstElementSibling(), lastElementSibling(), nextElementSibling(), previousElementSibling()
用下面方法获得元素的数据:
?
- attr(String key)  获得元素的属性值
- attr(String key, String value)  设置元素的属性值
- attributes()  获得所有属性
- id(), className(), classNames()  获得id、class的值
- text()  获得文本值
- text(String value)  设置文本值
- html()  获取html
- html(String value)  设置html
- outerHtml()  获得外部html(包含元素自身)
- data()  获得数据内容
- tag()  获得tag 和 tagName()  获得tagname
?
操作html提供了下面方法:
?
- append(String html), prepend(String html)
- appendText(String text), prependText(String text)
- appendElement(String tagName), prependElement(String tagName)
- html(String value)
File
input
=
new
File
(
"/tmp/input.html"
);
Document
doc
=
Jsoup
.
parse
(
input
,
"UTF-8"
,
"http://example.com/"
);
Elements
links
=
doc
.
select
(
"a[href]"
);
// a with href
Elements
pngs
=
doc
.
select
(
"img[src$=.png]"
);
?
// img with src ending .png
Element
masthead
=
doc
.
select
(
"div.masthead"
).
first
();
?
// div with class=masthead
Elements
resultLinks
=
doc
.
select
(
"h3.r > a"
);
// direct a after h3
?
支持的操作有下面这些:
?
- tagname  操作tag
- ns|tag  命名空间ns下的tag
- #id  用id获得元素
- .class  用class获得元素
- [attribute]  属性获得元素
- [^attr]  以attr开头的属性
- [attr=value]  属性值为value
- [attr^=value], [attr$=value], [attr*=value]
- [attr~=regex]  正则
- *  所有的标签
选择组合
- el#id  el和id定位
- el.class  el和class定位
- el[attr]  el和属性定位
- ancestor child  ancestor下面的child
?
?
- public ?? void ?parse(String?urlStr)?{??
- ????//?返回结果初始化。 ??
- ??
- ????Document?doc?=?null ;??
- ????try ?{??
- ????????doc?=?Jsoup??
- ????????????????.connect(urlStr)??
- ????????????????.userAgent(??
- ????????????????????????"Mozilla/5.0?(Windows;?U;?Windows?NT?5.1;?zh-CN;?rv:1.9.2.15)" )? //?设置User-Agent ??
- ????????????????.timeout(5000 )? //?设置连接超 时时间 ??
- ????????????????.get();??
- ????}?catch ?(MalformedURLException?e)?{??
- ????????log.error(?e);??
- ????????return ?;??
- ????}?catch ?(IOException?e)?{??
- ????????if ?(e? instanceof ?SocketTimeoutException)?{??
- ????????????log.error(?e);??
- ???????????????????????????????return ?;??
- ????????}??
- ????????if (e? instanceof ?UnknownHostException){??
- ????????????log.error(e);??
- ????????????return ?;??
- ????????}??
- ????????log.error(?e);??
- ????????return ?;??
- ????}??
- ????system.out.println(doc.title());??
- ????Element?head?=?doc.head();??
- ????Elements?metas?=?head.select("meta" );??
- ????for ?(Element?meta?:?metas)?{??
- ????????String?content?=?meta.attr("content" );??
- ????????if ?( "content-type" .equalsIgnoreCase(meta.attr( "http-equiv" ))??
- ????????????????&&?!StringUtils.startsWith(content,?"text/html" ))?{??
- ????????????log.debug(?urlStr);??
- ????????????return ?;??
- ????????}??
- ????????if ?( "description" .equalsIgnoreCase(meta.attr( "name" )))?{??
- ????????????system.out.println(meta.attr("content" ));??
- ????????}??
- ????}??
- ????Element?body?=?doc.body();??
- ????for ?(Element?img?:?body.getElementsByTag( "img" ))?{??
- ????????String?imageUrl?=?img.attr("abs:src" ); //获 得绝对路径 ??
- ????????for ?(String?suffix?:?IMAGE_TYPE_ARRAY)?{??
- ????????????if (imageUrl.indexOf( "?" )> 0 ){??
- ????????????????imageUrl=imageUrl.substring(0 ,imageUrl.indexOf( "?" ));??
- ????????????}??
- ????????????if ?(StringUtils.endsWithIgnoreCase(imageUrl,?suffix))?{??
- ????????????????imgSrcs.add(imageUrl);??
- ????????????????break ;??
- ????????????}??
- ????????}??
- ????}??
- }??
public void parse(String urlStr) { // 返回结果初始化。 Document doc = null; try { doc = Jsoup .connect(urlStr) .userAgent( "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.2.15)") // 设置User-Agent .timeout(5000) // 设置连接超时时间 .get(); } catch (MalformedURLException e) { log.error( e); return ; } catch (IOException e) { if (e instanceof SocketTimeoutException) { log.error( e); return ; } if(e instanceof UnknownHostException){ log.error(e); return ; } log.error( e); return ; } system.out.println(doc.title()); Element head = doc.head(); Elements metas = head.select("meta"); for (Element meta : metas) { String content = meta.attr("content"); if ("content-type".equalsIgnoreCase(meta.attr("http-equiv")) && !StringUtils.startsWith(content, "text/html")) { log.debug( urlStr); return ; } if ("description".equalsIgnoreCase(meta.attr("name"))) { system.out.println(meta.attr("content")); } } Element body = doc.body(); for (Element img : body.getElementsByTag("img")) { String imageUrl = img.attr("abs:src");//获得绝对路径 for (String suffix : IMAGE_TYPE_ARRAY) { if(imageUrl.indexOf("?")>0){ imageUrl=imageUrl.substring(0,imageUrl.indexOf("?")); } if (StringUtils.endsWithIgnoreCase(imageUrl, suffix)) { imgSrcs.add(imageUrl); break; } } } }?
这里重点要提的是怎么获得图片或链接的绝对地址:
  如上获得绝对地址的方法 String imageUrl = img.attr("abs:src");//获得绝对路径
,前面添加abs:,jsoup就会获得绝对地址;
想知道原因,咱们查看下源码,如下:
- //该方面是先从map中找看是否有该属性key,如果有直接返回,如果没有检查是否 ??
- //以abs:开头 ??
- ??public ?String?attr(String?attributeKey)?{??
- ????????Validate.notNull(attributeKey);??
- ??
- ????????if ?(hasAttr(attributeKey))??
- ????????????return ?attributes.get(attributeKey);??
- ????????else ? if ?(attributeKey.toLowerCase().startsWith( "abs:" ))??
- ????????????return ?absUrl(attributeKey.substring( "abs:" .length()));??
- ????????else ? return ? "" ;??
- ????}??
//该方面是先从map中找看是否有该属性key,如果有直接返回,如果没有检查是否 //以abs:开头 public String attr(String attributeKey) { Validate.notNull(attributeKey); if (hasAttr(attributeKey)) return attributes.get(attributeKey); else if (attributeKey.toLowerCase().startsWith("abs:")) return absUrl(attributeKey.substring("abs:".length())); else return ""; }
?
?接着查看absUrl方法:
?
- ??????
- ??
- ??/** ?
- ?????*?Get?an?absolute?URL?from?a?URL?attribute?that?may?be?relative?(i.e.?an?<code><a?href></code>?or ?
- ?????*?<code><img?src></code>). ?
- ?????*?<p/> ?
- ?????*?E.g.:?<code>String?absUrl?=?linkEl.absUrl("href");</code> ?
- ?????*?<p/> ?
- ?????*?If?the?attribute?value?is?already?absolute?(i.e.?it?starts?with?a?protocol,?like ?
- ?????*?<code>http://</code>?or?<code>https://</code>?etc),?and?it?successfully?parses?as?a?URL,?the?attribute?is ?
- ?????*?returned?directly.?Otherwise,?it?is?treated?as?a?URL?relative?to?the?element's?{@link?#baseUri},?and?made ?
- ?????*?absolute?using?that. ?
- ?????*?<p/> ?
- ?????*?As?an?alternate,?you?can?use?the?{@link?#attr}?method?with?the?<code>abs:</code>?prefix,?e.g.: ?
- ?????*?<code>String?absUrl?=?linkEl.attr("abs:href");</code> ?
- ?????* ?
- ?????*?@param?attributeKey?The?attribute?key ?
- ?????*?@return?An?absolute?URL?if?one?could?be?made,?or?an?empty?string?(not?null)?if?the?attribute?was?missing?or ?
- ?????*?could?not?be?made?successfully?into?a?URL. ?
- ?????*?@see?#attr ?
- ?????*?@see?java.net.URL#URL(java.net.URL,?String) ?
- ?????*/ ??
- //看到这里大家应该明白绝对地址是怎么取的了 ??
- public ?String?absUrl(String?attributeKey)?{??
- ????????Validate.notEmpty(attributeKey);??
- ??
- ????????String?relUrl?=?attr(attributeKey);??
- ????????if ?(!hasAttr(attributeKey))?{??
- ????????????return ? "" ;? //?nothing?to?make?absolute?with ??
- ????????}?else ?{??
- ????????????URL?base;??
- ????????????try ?{??
- ????????????????try ?{??
- ????????????????????base?=?new ?URL(baseUri);??
- ????????????????}?catch ?(MalformedURLException?e)?{??
- ????????????????????//?the?base?is?unsuitable,?but?the?attribute?may?be?abs?on?its?own,?so?try?that ??
- ????????????????????URL?abs?=?new ?URL(relUrl);??
- ????????????????????return ?abs.toExternalForm();??
- ????????????????}??
- ????????????????//?workaround:?java?resolves?'//path/file?+??foo'?to?'//path/?foo',?not?'//path/file?foo'?as?desired ??
- ????????????????if ?(relUrl.startsWith( "?" ))??
- ????????????????????relUrl?=?base.getPath()?+?relUrl;??
- ????????????????URL?abs?=?new ?URL(base,?relUrl);??
- ????????????????return ?abs.toExternalForm();??
- ????????????}?catch ?(MalformedURLException?e)?{??
- ????????????????return ? "" ;??
- ????????????}??
- ????????}??
- ????}??
/** * Get an absolute URL from a URL attribute that may be relative (i.e. an <code><a href></code> or * <code><img src></code>). * <p/> * E.g.: <code>String absUrl = linkEl.absUrl("href");</code> * <p/> * If the attribute value is already absolute (i.e. it starts with a protocol, like * <code>http://</code> or <code>https://</code> etc), and it successfully parses as a URL, the attribute is * returned directly. Otherwise, it is treated as a URL relative to the element's {@link #baseUri}, and made * absolute using that. * <p/> * As an alternate, you can use the {@link #attr} method with the <code>abs:</code> prefix, e.g.: * <code>String absUrl = linkEl.attr("abs:href");</code> * * @param attributeKey The attribute key * @return An absolute URL if one could be made, or an empty string (not null) if the attribute was missing or * could not be made successfully into a URL. * @see #attr * @see java.net.URL#URL(java.net.URL, String) */ //看到这里大家应该明白绝对地址是怎么取的了 public String absUrl(String attributeKey) { Validate.notEmpty(attributeKey); String relUrl = attr(attributeKey); if (!hasAttr(attributeKey)) { return ""; // nothing to make absolute with } else { URL base; try { try { base = new URL(baseUri); } catch (MalformedURLException e) { // the base is unsuitable, but the attribute may be abs on its own, so try that URL abs = new URL(relUrl); return abs.toExternalForm(); } // workaround: java resolves '//path/file + ?foo' to '//path/?foo', not '//path/file?foo' as desired if (relUrl.startsWith("?")) relUrl = base.getPath() + relUrl; URL abs = new URL(base, relUrl); return abs.toExternalForm(); } catch (MalformedURLException e) { return ""; } } }?
?
?