冰之龙原创 冰之龙代码。因为代码中使用的正则表达式只匹配 ASCII 字符、不匹配中文,所以 MP3 下载列表中没有包含中文链接。要包含中文链接也很容易:自己抓取 http、MP3,用字符串查找即可。jsoup 下载地址为 [url=http://jsoup.org/download]http://jsoup.org/download[/url] 代码
1 楼
zzjb011
2012-04-12
package soso.tool;
import java.io.IOException;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
//import org.jsoup.helper.Validate;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/*
* jsoup java 抓取百度MP3 top500 ----冰之龙代码
* 冰之龙原创 冰之龙代码
* 时间 2012-4-12 1:10:49
* */
/**
 * Scrapes a Baidu MP3 chart page with jsoup and collects, per song, every URL containing
 * ".mp3" found on the pages reachable from that song's search link.
 *
 * <p>Flow used by {@link #main(String[])}: fetch the start page, collect its anchors, derive one
 * {@link SongInfo} per chart search link, then for each song follow its search page and the
 * download-box pages linked from it.
 *
 * <p>The instance is stateful: every fetch overwrites {@link #startUrl} and {@link #doc}, and
 * {@link #initUrlsLinksArrayList()} overwrites {@link #al}.
 */
public class MyUrls {

    /** Maximum number of characters kept from a link's text. */
    private static final int MAX_TITLE_LENGTH = 35 * 10;

    /**
     * Matches http/https URLs consisting only of word characters, '.', '-', '/' and ':'.
     * Compiled once instead of on every {@link #fillMP3List(ArrayList)} call.
     */
    private static final Pattern URL_PATTERN =
            Pattern.compile("(http://|https://){1}[\\w\\.\\-/:]+");

    /** URL fetched by the next {@link #getUrlContent()} call. */
    String startUrl;
    /** Most recently fetched document; null until the first successful fetch. */
    Document doc;
    /** Anchors with an href attribute in {@link #doc}. */
    Elements links;
    /** Elements with a src attribute in {@link #doc}. */
    Elements media;
    /** link elements with an href attribute in {@link #doc}. */
    Elements imports;
    /** URL/title pairs built from {@link #links}. */
    ArrayList<UrlAndTitle> al = new ArrayList<UrlAndTitle>();
    /** Songs derived from the chart links in {@link #al}. */
    ArrayList<SongInfo> songInfolList = new ArrayList<MyUrls.SongInfo>();
    /** Download-box page URLs collected by the latest {@link #getEveryDownList(String)} call. */
    private ArrayList<String> everyDownList = new ArrayList<String>();

    /**
     * Creates a scraper positioned on the given page.
     *
     * @param url URL of the first page to fetch
     */
    public MyUrls(String url) {
        startUrl = url;
    }

    /**
     * Fetches {@link #startUrl} into {@link #doc}.
     *
     * @return true on success; false if an {@link IOException} occurred, in which case its
     *         message is printed and {@link #doc} keeps its previous value
     */
    public boolean getUrlContent() {
        try {
            doc = Jsoup.connect(startUrl).get();
        } catch (IOException e) {
            System.out.println(e.getMessage());
            return false;
        }
        return true;
    }

    /** Selects the anchor, src-bearing and link elements of the current document. */
    public void initElements() {
        links = doc.select("a[href]");
        media = doc.select("[src]");
        imports = doc.select("link[href]");
    }

    /** Prints the title and URL of every collected link, followed by the total count. */
    public void UrlsLinksArrayListDisplay() {
        for (UrlAndTitle uat : al) {
            System.out.println("标题:" + uat.title);
            System.out.println("网址:" + uat.myURL);
            System.out.println();
        }
        System.out.println("共有" + al.size() + "个符合结果");
    }

    /** Rebuilds {@link #al} from {@link #links}, using absolute hrefs and truncated link text. */
    public void initUrlsLinksArrayList() {
        al.clear();
        for (Element link : links) {
            UrlAndTitle uat = new UrlAndTitle();
            uat.myURL = link.attr("abs:href");
            uat.title = trim(link.text(), MAX_TITLE_LENGTH);
            al.add(uat);
        }
    }

    /**
     * Truncates {@code s} to at most {@code width} characters, ending a truncated result with ".".
     *
     * @param s     text to shorten
     * @param width maximum length of the result; expected to be at least 1
     * @return {@code s} unchanged when it fits, otherwise its first {@code width - 1} characters
     *         followed by "."
     */
    private static String trim(String s, int width) {
        if (s.length() > width) {
            return s.substring(0, width - 1) + ".";
        }
        return s;
    }

    /**
     * Derives a {@link SongInfo} from every collected link that is a chart search URL, taking the
     * text between "&amp;word=" and the following "&amp;lm=" as "name+artist".
     *
     * <p>Links missing either marker are skipped instead of raising
     * {@link StringIndexOutOfBoundsException}. The word is split directly on '+', so a literal
     * ':' in the word is no longer treated as a separator, and a word consisting only of '+'
     * yields an empty name instead of an {@link ArrayIndexOutOfBoundsException}.
     */
    public void initEverySongSoSoPara() {
        String partHare = "http://mp3.baidu.com/m?rf=top-index&tn=baidump";
        String sosoH = "&word=";
        String sosoE = "&lm=";
        for (UrlAndTitle uat : al) {
            if (!uat.myURL.contains(partHare)) {
                continue;
            }
            int markerPos = uat.myURL.indexOf(sosoH);
            if (markerPos < 0) {
                continue;
            }
            int wordStart = markerPos + sosoH.length();
            // Search for the end marker only after the word starts, so an earlier "&lm=" cannot
            // produce an inverted range.
            int wordEnd = uat.myURL.indexOf(sosoE, wordStart);
            if (wordEnd < 0) {
                continue;
            }
            String song = uat.myURL.substring(wordStart, wordEnd);
            // split() drops trailing empty strings, so "+" alone yields a zero-length array.
            String[] parts = song.split("\\+");
            SongInfo songInfo = new SongInfo();
            songInfo.name = parts.length > 0 ? parts[0] : "";
            if (parts.length > 1) {
                songInfo.actor = parts[1];
            }
            songInfo.sosoList = uat.myURL;
            songInfolList.add(songInfo);
        }
    }

    /** Prints index, name, artist and search URL of every song, followed by the total count. */
    public void SongInfoArrayListDisplay() {
        int i = 0;
        for (SongInfo si : songInfolList) {
            i++;
            System.out.println(i + ":");
            System.out.println("歌曲名称:" + si.name);
            System.out.println("艺术家:" + si.actor);
            System.out.println("网址:" + si.sosoList);
            System.out.println();
        }
        System.out.println("共有" + i + "个符合结果");
    }

    /**
     * Fetches the song's search page, follows each download-box link found there, appends every
     * MP3 URL found to {@code songInfo.mp3downList}, then removes duplicates.
     *
     * @param songInfo song whose download list is filled; its list is created when null
     */
    public void fillMp3DownList(SongInfo songInfo) {
        if (songInfo.mp3downList == null) {
            songInfo.mp3downList = new ArrayList<String>();
        }
        ArrayList<String> downPages = getEveryDownList(songInfo.sosoList);
        for (String downPage : downPages) {
            songInfo.mp3downList.addAll(getMp3List(downPage));
        }
        delMP3List(songInfo.mp3downList);
    }

    /**
     * Prints the song's name and artist followed by one download URL per line.
     *
     * @param songInfo song to print; a null download list prints no URLs
     */
    public void songInfomp3downListDisplay(SongInfo songInfo) {
        System.out.println("歌曲名称:" + songInfo.name);
        System.out.println("艺术家:" + songInfo.actor);
        if (songInfo.mp3downList == null) {
            return;
        }
        for (String mp3Url : songInfo.mp3downList) {
            System.out.println(mp3Url);
        }
    }

    /**
     * Removes duplicate entries from {@code mp3List} in place, keeping the first occurrence of
     * each and preserving order.
     *
     * <p>Entries are compared for equality; a URL that merely is a substring of an earlier one is
     * kept.
     *
     * @param mp3List list to de-duplicate
     */
    public void delMP3List(ArrayList<String> mp3List) {
        ArrayList<String> unique = new ArrayList<String>();
        for (String candidate : mp3List) {
            if (!unique.contains(candidate)) {
                unique.add(candidate);
            }
        }
        mp3List.clear();
        mp3List.addAll(unique);
    }

    /**
     * Fetches {@code url} and returns the download-box page links found on it.
     *
     * @param url page to fetch; becomes the new {@link #startUrl}
     * @return the matching links, empty when the fetch fails
     */
    public ArrayList<String> getEveryDownList(String url) {
        everyDownList = new ArrayList<String>();
        startUrl = url;
        if (getUrlContent()) {
            initElements();
            initUrlsLinksArrayList();
            fillEveryDownList();
        }
        return everyDownList;
    }

    /** Appends to the current download-page list every link in {@link #al} that matches. */
    public void fillEveryDownList() {
        String startWith = "http://box.zhangmen.baidu.com/m?word=mp3";
        String contains = "baidusg,";
        for (UrlAndTitle uat : al) {
            if (uat.myURL.startsWith(startWith) && uat.myURL.contains(contains)) {
                everyDownList.add(uat.myURL);
            }
        }
    }

    /**
     * Fetches {@code url} and returns the MP3 URLs found in its HTML.
     *
     * @param url page to fetch; becomes the new {@link #startUrl}
     * @return the URLs found, empty when the fetch fails
     */
    public ArrayList<String> getMp3List(String url) {
        ArrayList<String> mp3List = new ArrayList<String>();
        startUrl = url;
        if (getUrlContent()) {
            fillMP3List(mp3List);
        }
        return mp3List;
    }

    /**
     * Placeholder that currently ignores its arguments.
     *
     * @return always the empty string
     */
    private static String getFilteredContent(String htmlContent, String reg, int i) {
        String content = "";
        return content;
    }

    /**
     * Appends to {@code mp3List} every URL in the current document's HTML that contains ".mp3"
     * and does not contain "...".
     *
     * @param mp3List list receiving the matches
     */
    public void fillMP3List(ArrayList<String> mp3List) {
        String endWith = ".mp3";
        String no = "...";
        Matcher matcher = URL_PATTERN.matcher(doc.html());
        while (matcher.find()) {
            String url = matcher.group();
            if (url.contains(endWith) && !url.contains(no)) {
                mp3List.add(url);
            }
        }
    }

    /**
     * Placeholder: runs the stub filter over the current document's HTML and discards the result.
     *
     * @return always an empty list
     */
    public ArrayList<String> getBaiduSongs() {
        ArrayList<String> ss = new ArrayList<String>();
        String reg = "(.*?)";
        getFilteredContent(doc.html(), reg, 0);
        return ss;
    }

    /**
     * Scrapes the top-500 chart page and prints the download URLs found for each song.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String url = "http://list.mp3.baidu.com/top/top500.html";
        MyUrls mu = new MyUrls(url);
        if (mu.getUrlContent()) {
            mu.initElements();
            mu.initUrlsLinksArrayList();
            mu.initEverySongSoSoPara();
            for (int i = 0; i < mu.songInfolList.size(); i++) {
                mu.fillMp3DownList(mu.songInfolList.get(i));
                mu.songInfomp3downListDisplay(mu.songInfolList.get(i));
            }
        }
    }

    /** A link's absolute URL together with its (possibly truncated) text. */
    class UrlAndTitle {
        String myURL;
        String title;
    }

    /** One chart entry: song name, artist, search-page URL and collected download URLs. */
    class SongInfo {
        String name;
        /** Artist; stays null when the search word carries no "+artist" part. */
        String actor;
        /** URL of the song's search page. */
        String sosoList;
        /** Download URLs; null until {@link MyUrls#fillMp3DownList(SongInfo)} runs. */
        ArrayList<String> mp3downList = null;
    }
}
import java.io.IOException;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
//import org.jsoup.helper.Validate;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/*
* jsoup java 抓取百度MP3 top500 ----冰之龙代码
* 冰之龙原创 冰之龙代码
* 时间 2012-4-12 1:10:49
* */
/**
 * Scrapes a Baidu MP3 chart page with jsoup and collects, per song, every URL containing
 * ".mp3" found on the pages reachable from that song's search link.
 *
 * <p>Flow used by {@link #main(String[])}: fetch the start page, collect its anchors, derive one
 * {@link SongInfo} per chart search link, then for each song follow its search page and the
 * download-box pages linked from it.
 *
 * <p>The instance is stateful: every fetch overwrites {@link #startUrl} and {@link #doc}, and
 * {@link #initUrlsLinksArrayList()} overwrites {@link #al}.
 */
public class MyUrls {

    /** Maximum number of characters kept from a link's text. */
    private static final int MAX_TITLE_LENGTH = 35 * 10;

    /**
     * Matches http/https URLs consisting only of word characters, '.', '-', '/' and ':'.
     * Compiled once instead of on every {@link #fillMP3List(ArrayList)} call.
     */
    private static final Pattern URL_PATTERN =
            Pattern.compile("(http://|https://){1}[\\w\\.\\-/:]+");

    /** URL fetched by the next {@link #getUrlContent()} call. */
    String startUrl;
    /** Most recently fetched document; null until the first successful fetch. */
    Document doc;
    /** Anchors with an href attribute in {@link #doc}. */
    Elements links;
    /** Elements with a src attribute in {@link #doc}. */
    Elements media;
    /** link elements with an href attribute in {@link #doc}. */
    Elements imports;
    /** URL/title pairs built from {@link #links}. */
    ArrayList<UrlAndTitle> al = new ArrayList<UrlAndTitle>();
    /** Songs derived from the chart links in {@link #al}. */
    ArrayList<SongInfo> songInfolList = new ArrayList<MyUrls.SongInfo>();
    /** Download-box page URLs collected by the latest {@link #getEveryDownList(String)} call. */
    private ArrayList<String> everyDownList = new ArrayList<String>();

    /**
     * Creates a scraper positioned on the given page.
     *
     * @param url URL of the first page to fetch
     */
    public MyUrls(String url) {
        startUrl = url;
    }

    /**
     * Fetches {@link #startUrl} into {@link #doc}.
     *
     * @return true on success; false if an {@link IOException} occurred, in which case its
     *         message is printed and {@link #doc} keeps its previous value
     */
    public boolean getUrlContent() {
        try {
            doc = Jsoup.connect(startUrl).get();
        } catch (IOException e) {
            System.out.println(e.getMessage());
            return false;
        }
        return true;
    }

    /** Selects the anchor, src-bearing and link elements of the current document. */
    public void initElements() {
        links = doc.select("a[href]");
        media = doc.select("[src]");
        imports = doc.select("link[href]");
    }

    /** Prints the title and URL of every collected link, followed by the total count. */
    public void UrlsLinksArrayListDisplay() {
        for (UrlAndTitle uat : al) {
            System.out.println("标题:" + uat.title);
            System.out.println("网址:" + uat.myURL);
            System.out.println();
        }
        System.out.println("共有" + al.size() + "个符合结果");
    }

    /** Rebuilds {@link #al} from {@link #links}, using absolute hrefs and truncated link text. */
    public void initUrlsLinksArrayList() {
        al.clear();
        for (Element link : links) {
            UrlAndTitle uat = new UrlAndTitle();
            uat.myURL = link.attr("abs:href");
            uat.title = trim(link.text(), MAX_TITLE_LENGTH);
            al.add(uat);
        }
    }

    /**
     * Truncates {@code s} to at most {@code width} characters, ending a truncated result with ".".
     *
     * @param s     text to shorten
     * @param width maximum length of the result; expected to be at least 1
     * @return {@code s} unchanged when it fits, otherwise its first {@code width - 1} characters
     *         followed by "."
     */
    private static String trim(String s, int width) {
        if (s.length() > width) {
            return s.substring(0, width - 1) + ".";
        }
        return s;
    }

    /**
     * Derives a {@link SongInfo} from every collected link that is a chart search URL, taking the
     * text between "&amp;word=" and the following "&amp;lm=" as "name+artist".
     *
     * <p>Links missing either marker are skipped instead of raising
     * {@link StringIndexOutOfBoundsException}. The word is split directly on '+', so a literal
     * ':' in the word is no longer treated as a separator, and a word consisting only of '+'
     * yields an empty name instead of an {@link ArrayIndexOutOfBoundsException}.
     */
    public void initEverySongSoSoPara() {
        String partHare = "http://mp3.baidu.com/m?rf=top-index&tn=baidump";
        String sosoH = "&word=";
        String sosoE = "&lm=";
        for (UrlAndTitle uat : al) {
            if (!uat.myURL.contains(partHare)) {
                continue;
            }
            int markerPos = uat.myURL.indexOf(sosoH);
            if (markerPos < 0) {
                continue;
            }
            int wordStart = markerPos + sosoH.length();
            // Search for the end marker only after the word starts, so an earlier "&lm=" cannot
            // produce an inverted range.
            int wordEnd = uat.myURL.indexOf(sosoE, wordStart);
            if (wordEnd < 0) {
                continue;
            }
            String song = uat.myURL.substring(wordStart, wordEnd);
            // split() drops trailing empty strings, so "+" alone yields a zero-length array.
            String[] parts = song.split("\\+");
            SongInfo songInfo = new SongInfo();
            songInfo.name = parts.length > 0 ? parts[0] : "";
            if (parts.length > 1) {
                songInfo.actor = parts[1];
            }
            songInfo.sosoList = uat.myURL;
            songInfolList.add(songInfo);
        }
    }

    /** Prints index, name, artist and search URL of every song, followed by the total count. */
    public void SongInfoArrayListDisplay() {
        int i = 0;
        for (SongInfo si : songInfolList) {
            i++;
            System.out.println(i + ":");
            System.out.println("歌曲名称:" + si.name);
            System.out.println("艺术家:" + si.actor);
            System.out.println("网址:" + si.sosoList);
            System.out.println();
        }
        System.out.println("共有" + i + "个符合结果");
    }

    /**
     * Fetches the song's search page, follows each download-box link found there, appends every
     * MP3 URL found to {@code songInfo.mp3downList}, then removes duplicates.
     *
     * @param songInfo song whose download list is filled; its list is created when null
     */
    public void fillMp3DownList(SongInfo songInfo) {
        if (songInfo.mp3downList == null) {
            songInfo.mp3downList = new ArrayList<String>();
        }
        ArrayList<String> downPages = getEveryDownList(songInfo.sosoList);
        for (String downPage : downPages) {
            songInfo.mp3downList.addAll(getMp3List(downPage));
        }
        delMP3List(songInfo.mp3downList);
    }

    /**
     * Prints the song's name and artist followed by one download URL per line.
     *
     * @param songInfo song to print; a null download list prints no URLs
     */
    public void songInfomp3downListDisplay(SongInfo songInfo) {
        System.out.println("歌曲名称:" + songInfo.name);
        System.out.println("艺术家:" + songInfo.actor);
        if (songInfo.mp3downList == null) {
            return;
        }
        for (String mp3Url : songInfo.mp3downList) {
            System.out.println(mp3Url);
        }
    }

    /**
     * Removes duplicate entries from {@code mp3List} in place, keeping the first occurrence of
     * each and preserving order.
     *
     * <p>Entries are compared for equality; a URL that merely is a substring of an earlier one is
     * kept.
     *
     * @param mp3List list to de-duplicate
     */
    public void delMP3List(ArrayList<String> mp3List) {
        ArrayList<String> unique = new ArrayList<String>();
        for (String candidate : mp3List) {
            if (!unique.contains(candidate)) {
                unique.add(candidate);
            }
        }
        mp3List.clear();
        mp3List.addAll(unique);
    }

    /**
     * Fetches {@code url} and returns the download-box page links found on it.
     *
     * @param url page to fetch; becomes the new {@link #startUrl}
     * @return the matching links, empty when the fetch fails
     */
    public ArrayList<String> getEveryDownList(String url) {
        everyDownList = new ArrayList<String>();
        startUrl = url;
        if (getUrlContent()) {
            initElements();
            initUrlsLinksArrayList();
            fillEveryDownList();
        }
        return everyDownList;
    }

    /** Appends to the current download-page list every link in {@link #al} that matches. */
    public void fillEveryDownList() {
        String startWith = "http://box.zhangmen.baidu.com/m?word=mp3";
        String contains = "baidusg,";
        for (UrlAndTitle uat : al) {
            if (uat.myURL.startsWith(startWith) && uat.myURL.contains(contains)) {
                everyDownList.add(uat.myURL);
            }
        }
    }

    /**
     * Fetches {@code url} and returns the MP3 URLs found in its HTML.
     *
     * @param url page to fetch; becomes the new {@link #startUrl}
     * @return the URLs found, empty when the fetch fails
     */
    public ArrayList<String> getMp3List(String url) {
        ArrayList<String> mp3List = new ArrayList<String>();
        startUrl = url;
        if (getUrlContent()) {
            fillMP3List(mp3List);
        }
        return mp3List;
    }

    /**
     * Placeholder that currently ignores its arguments.
     *
     * @return always the empty string
     */
    private static String getFilteredContent(String htmlContent, String reg, int i) {
        String content = "";
        return content;
    }

    /**
     * Appends to {@code mp3List} every URL in the current document's HTML that contains ".mp3"
     * and does not contain "...".
     *
     * @param mp3List list receiving the matches
     */
    public void fillMP3List(ArrayList<String> mp3List) {
        String endWith = ".mp3";
        String no = "...";
        Matcher matcher = URL_PATTERN.matcher(doc.html());
        while (matcher.find()) {
            String url = matcher.group();
            if (url.contains(endWith) && !url.contains(no)) {
                mp3List.add(url);
            }
        }
    }

    /**
     * Placeholder: runs the stub filter over the current document's HTML and discards the result.
     *
     * @return always an empty list
     */
    public ArrayList<String> getBaiduSongs() {
        ArrayList<String> ss = new ArrayList<String>();
        String reg = "(.*?)";
        getFilteredContent(doc.html(), reg, 0);
        return ss;
    }

    /**
     * Scrapes the top-500 chart page and prints the download URLs found for each song.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String url = "http://list.mp3.baidu.com/top/top500.html";
        MyUrls mu = new MyUrls(url);
        if (mu.getUrlContent()) {
            mu.initElements();
            mu.initUrlsLinksArrayList();
            mu.initEverySongSoSoPara();
            for (int i = 0; i < mu.songInfolList.size(); i++) {
                mu.fillMp3DownList(mu.songInfolList.get(i));
                mu.songInfomp3downListDisplay(mu.songInfolList.get(i));
            }
        }
    }

    /** A link's absolute URL together with its (possibly truncated) text. */
    class UrlAndTitle {
        String myURL;
        String title;
    }

    /** One chart entry: song name, artist, search-page URL and collected download URLs. */
    class SongInfo {
        String name;
        /** Artist; stays null when the search word carries no "+artist" part. */
        String actor;
        /** URL of the song's search page. */
        String sosoList;
        /** Download URLs; null until {@link MyUrls#fillMp3DownList(SongInfo)} runs. */
        ArrayList<String> mp3downList = null;
    }
}