当前位置: 代码迷 >> C# >> 用代码抓网页有点https抓不到
  详细解决方案

用代码抓网页,有些https站点抓不到

热度:40   发布时间:2016-05-05 04:45:09.0
用代码抓网页有些https抓不到

/// <summary>
/// Downloads the page at <paramref name="pageUrl"/> and returns its body as text.
/// </summary>
/// <param name="pageUrl">Absolute http or https URL of the page to fetch.</param>
/// <param name="encoding">Encoding used to decode the response body.</param>
/// <param name="requestMethod">HTTP method to send, e.g. "GET".</param>
/// <param name="timeOut">Request timeout in milliseconds; -1 leaves the request's default timeout untouched.</param>
/// <returns>
/// The response body with every line terminated by "\r\n", or an empty string
/// when the download fails for any reason (the failure is written to the trace log).
/// </returns>
public static string DownloadHtmlPage(string pageUrl, Encoding encoding, string requestMethod, int timeOut)
        {
            const string UserAgent = @"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.97 Safari/537.11";

            try
            {
                bool isHttps = pageUrl.StartsWith("https", StringComparison.OrdinalIgnoreCase);
                if (isHttps)
                {
                    // NOTE(review): this callback is process-wide and CheckValidationResult is
                    // defined elsewhere; if it accepts every certificate, HTTPS validation is
                    // effectively disabled for the whole application - confirm that is intended.
                    ServicePointManager.ServerCertificateValidationCallback = new RemoteCertificateValidationCallback(CheckValidationResult);

                    // Add TLS 1.2 to the enabled protocols instead of pinning SSL 3.0 only.
                    // Pinning Ssl3 makes the handshake fail against servers that have SSL 3.0
                    // disabled, which surfaces as "the underlying connection was closed".
                    ServicePointManager.SecurityProtocol |= SecurityProtocolType.Tls12;
                }

                // Direct cast: a non-HTTP URL fails here with a clear InvalidCastException
                // instead of a NullReferenceException on the first property access.
                HttpWebRequest request = (HttpWebRequest)WebRequest.Create(pageUrl);
                if (isHttps)
                {
                    request.ProtocolVersion = HttpVersion.Version10;
                    request.KeepAlive = false;
                }
                else
                {
                    request.KeepAlive = true;
                }

                request.Method = requestMethod;
                request.ServicePoint.Expect100Continue = false;
                // Disable Nagle's algorithm so small packets are sent without delay.
                request.ServicePoint.UseNagleAlgorithm = false;
                // Maximum number of concurrent connections to this endpoint.
                request.ServicePoint.ConnectionLimit = 65500;
                // Do not buffer data written to the request stream.
                request.AllowWriteStreamBuffering = false;
                request.UserAgent = UserAgent;

                // Setting AutomaticDecompression makes the request advertise gzip/deflate
                // and transparently decompresses the response stream.
                request.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
                request.Headers.Add("Accept-Charset", "GBK,utf-8;q=0.7,*;q=0.3");
                request.Headers.Add("Accept-Language", "zh-CN,zh;q=0.8");
                request.Headers.Add("Cache-Control", "max-age=0");
                if (timeOut != -1)
                {
                    request.Timeout = timeOut;
                }

                using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                using (Stream data = response.GetResponseStream())
                using (StreamReader sr = new StreamReader(data, encoding))
                {
                    StringBuilder source = new StringBuilder();
                    string line;
                    while ((line = sr.ReadLine()) != null)
                    {
                        // Line endings are normalised to CRLF, including after the last line.
                        source.Append(line).Append("\r\n");
                    }

                    return source.ToString();
                }
            }
            catch (Exception ex)
            {
                // Best-effort contract: callers receive an empty string on any failure,
                // but the cause is recorded instead of being silently discarded.
                System.Diagnostics.Trace.TraceWarning("DownloadHtmlPage failed for {0}: {1}", pageUrl, ex);
                return string.Empty;
            }
        }

有些站能抓到,比如 https://www.baidu.com
但是有些站抓不到,会提示“基础连接已经关闭”。网上找了各种教程,都没有彻底解决这个问题,求大神指点。
------解决思路----------------------

如果网站启用了 GZIP 压缩,而你的请求头里没有声明支持 GZIP,并且接收响应时也没有进行 GZIP 解压,就可能提示“基础连接已经关闭”。




request.Headers.Add("Accept-Encoding: gzip, deflate"); // 这个是关键




 response = (HttpWebResponse)request.GetResponse();


                if ((response.Headers["Content-Encoding"] != null) && (response.Headers["Content-Encoding"].ToString().IndexOf("gzip") > -1)) // 这个也是关键。
                {

                    GZipStream gstream = new GZipStream(response.GetResponseStream(), CompressionMode.Decompress);
                    reader = new StreamReader(gstream, encoding);

                }
                else
                {
                    reader = new StreamReader(response.GetResponseStream(), this.encoding);
                }
               



------解决思路----------------------
// HTTPS-only setup quoted from the question's DownloadHtmlPage method.
if (pageUrl.StartsWith("https", StringComparison.OrdinalIgnoreCase))
                {
                    // Installs a certificate validation callback on the static ServicePointManager.
                    ServicePointManager.ServerCertificateValidationCallback = new RemoteCertificateValidationCallback(CheckValidationResult);
                    // Plain assignment (not |=): SSL 3.0 becomes the only enabled protocol.
                    // NOTE(review): servers that no longer accept SSL 3.0 would presumably fail
                    // the handshake here - likely related to the reported error; confirm.
                    ServicePointManager.SecurityProtocol = SecurityProtocolType.Ssl3;
                    // "as" yields null if the created request is not an HttpWebRequest.
                    request = WebRequest.Create(pageUrl) as HttpWebRequest;
                    // Sends the request as HTTP/1.0 without a persistent connection.
                    request.ProtocolVersion = HttpVersion.Version10;
                    request.KeepAlive = false;
                }


这段代码似乎是多余的,HttpWebRequest 会自动适配 https。
  相关解决方案