仿照http://blog.csdn.net/pleasecallmewhy/article/details/17630063写一个简单的抓取知乎推荐(http://www.zhihu.com/explore/recommendations)的爬虫,读出的网页源码总是乱码,而同样的代码读取别的网站都是正常的。是知乎网站本身做了什么限制吗?有人说知乎输出的内容是json格式,是否有关系?
爬虫代码如下:
package test;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.regex.*;
public class Main {
static String sendGet(String url) throws IOException{
//String url="http://www.baidu.com";
String result="";
BufferedReader in=null;
try {
URL realUrl=new URL(url);
URLConnection connection=realUrl.openConnection();
connection.connect();
in= new BufferedReader(new InputStreamReader(connection.getInputStream(),"UTF-8"));
String line;
while ((line=in.readLine())!=null){
result+=line;
}
} catch (MalformedURLException e) {
System.out.print("发送GET请求出现异常!" + e);
// TODO Auto-generated catch block
e.printStackTrace();
}
finally{
try {
if (in != null) {
in.close();
}
} catch (Exception e2) {
e2.printStackTrace();
}
}
return result;
}
static String RegexString(String tartgetStr,String patterStr){
Pattern p=Pattern.compile(patterStr);
Matcher matcher=p.matcher(tartgetStr);
if (matcher.find()){
return matcher.group(1);
}
return "nothing";
}
public static void main(String[] args) throws IOException{
String url="http://www.zhihu.com/explore/recommendations";
String result=sendGet(url);
System.out.println(result);
}
}
输出的乱码是这种形式的:
------解决思路----------------------
static String sendGet(String url) throws IOException {
// String url="http://www.baidu.com";
String result = "";
BufferedReader in = null ;
try {
URL realUrl = new URL(url);
URLConnection connection = realUrl.openConnection();
connection.connect();
InputStream urlStream = new GZIPInputStream(connection.getInputStream());
in = new BufferedReader(new InputStreamReader(urlStream,"utf-8"));
String line = "";
while ((line = in.readLine()) != null) {
System.out.println(line);
result += line;
}
} catch (MalformedURLException e) {
System.out.print("发送GET请求出现异常!" + e);
// TODO Auto-generated catch block
e.printStackTrace();
} finally {
try {
if (in != null) {
in.close();
}
} catch (Exception e2) {
e2.printStackTrace();
}
}
return result;
}