/// <summary>
/// Downloads the resource at <paramref name="pageUrl"/> and returns its body decoded as text.
/// </summary>
/// <param name="pageUrl">Absolute http or https URL to request.</param>
/// <param name="encoding">Encoding used to decode the response stream.</param>
/// <param name="requestMethod">HTTP verb assigned to the request (for example "GET").</param>
/// <param name="timeOut">Request timeout in milliseconds; -1 leaves the HttpWebRequest default untouched.</param>
/// <returns>
/// The response body with every line terminated by "\r\n", or an empty string when the
/// request fails for any reason (the failure is written to <c>System.Diagnostics.Trace</c>).
/// </returns>
public static string DownloadHtmlPage(string pageUrl, Encoding encoding, string requestMethod, int timeOut)
{
    const string UserAgent = @"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.97 Safari/537.11";

    try
    {
        HttpWebRequest request;
        if (pageUrl.StartsWith("https", StringComparison.OrdinalIgnoreCase))
        {
            // NOTE(review): this installs a process-wide certificate callback; CheckValidationResult is
            // defined outside this block - confirm it does not blindly accept every certificate.
            ServicePointManager.ServerCertificateValidationCallback = new RemoteCertificateValidationCallback(CheckValidationResult);

            // Forcing SSL 3.0 only (the previous behaviour) makes servers that have disabled SSL 3.0
            // drop the handshake ("the underlying connection was closed"). Enable TLS 1.2 in addition
            // to whatever is already enabled instead. SecurityProtocolType.Tls12 needs .NET Framework 4.5+.
            ServicePointManager.SecurityProtocol |= SecurityProtocolType.Tls12;

            request = (HttpWebRequest)WebRequest.Create(pageUrl);
            request.ProtocolVersion = HttpVersion.Version10;
            request.KeepAlive = false;
        }
        else
        {
            request = (HttpWebRequest)WebRequest.Create(pageUrl);
            request.KeepAlive = true;
        }

        request.Method = requestMethod;
        request.ServicePoint.Expect100Continue = false;
        // Nagle's algorithm is switched off for efficiency.
        request.ServicePoint.UseNagleAlgorithm = false;
        // Maximum number of connections allowed on this service point.
        request.ServicePoint.ConnectionLimit = 65500;
        // Write-stream buffering is switched off for efficiency.
        request.AllowWriteStreamBuffering = false;
        request.UserAgent = UserAgent;
        // No explicit Accept-Encoding header is added: AutomaticDecompression takes care of
        // advertising gzip/deflate and of decompressing the response.
        request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
        request.Headers.Add("Accept-Charset", "GBK,utf-8;q=0.7,*;q=0.3");
        request.Headers.Add("Accept-Language", "zh-CN,zh;q=0.8");
        request.Headers.Add("Cache-Control", "max-age=0");
        if (timeOut != -1)
        {
            request.Timeout = timeOut;
        }

        // The using statements dispose reader, stream and response on both success and failure.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream data = response.GetResponseStream())
        using (StreamReader reader = new StreamReader(data, encoding))
        {
            // Read line by line so that every line ending is normalised to "\r\n".
            StringBuilder source = new StringBuilder();
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                source.Append(line).Append("\r\n");
            }
            return source.ToString();
        }
    }
    catch (Exception ex)
    {
        // Best effort by contract: callers receive an empty string on failure, but the reason
        // is recorded instead of being silently discarded.
        System.Diagnostics.Trace.TraceWarning("DownloadHtmlPage failed for {0}: {1}", pageUrl, ex);
        return string.Empty;
    }
}
有些站能抓到,比如 https://www.baidu.com
但是有些站抓不到,会提示基础连接已经关闭,网上找了各种教程都没有彻底解决这个问题。求大神指点。
------解决思路----------------------
如果网站启用了 GZIP 压缩,而你的 HTTP 请求头里没有加入 Accept-Encoding,接收时也没有进行 GZIP 解压,就会提示“基础连接已经关闭”。
request.Headers.Add("Accept-Encoding: gzip, deflate"); 这个是关键
response = (HttpWebResponse)request.GetResponse();
if ((response.Headers["Content-Encoding"] != null) && (response.Headers["Content-Encoding"].ToString().IndexOf("gzip") > -1)) 这个也是关键。
{
GZipStream gstream = new GZipStream(response.GetResponseStream(), CompressionMode.Decompress);
reader = new StreamReader(gstream, encoding);
}
else
{
reader = new StreamReader(response.GetResponseStream(), this.encoding);
}
------解决思路----------------------
if (pageUrl.StartsWith("https", StringComparison.OrdinalIgnoreCase))
{
ServicePointManager.ServerCertificateValidationCallback = new RemoteCertificateValidationCallback(CheckValidationResult);
ServicePointManager.SecurityProtocol = SecurityProtocolType.Ssl3;
request = WebRequest.Create(pageUrl) as HttpWebRequest;
request.ProtocolVersion = HttpVersion.Version10;
request.KeepAlive = false;
}
这一段似乎是多余的,HttpWebRequest 会自动适配 HTTPS。