现在有一个需求:比如在百度中搜索“准备怀孕要做哪些检查”,
要查出我公司网站在搜索结果中的排名。
需求挺简单的
~~~
大家可以测试一下.使用各种方式
例如用 WebClient、HttpWebRequest、XMLHTTP、WebBrowser 去请求百度,各种方式取到的 HTML 源码不尽相同!
比如请求
http://www.baidu.com/s?wd=准备怀孕要做哪些检查
将出现以下问题
问题1 : HTML源码不同,导致解析出的排位也不同。
问题2 : 如果某一种方式请求过多,会出现验证码,但是这时候用IE去访问,是不需要验证码的。
我的分析是
百度可以区分来自浏览器访问 与 各种Com组件或是代码方式的请求
但是:
我使用各种HTTP访问监视工具,如
Fiddler,记录下IE请求时发送的头信息,然后用HttpWebRequest逐一对应地设置这些头信息,但还是出现上述问题。
下面发代码!
使用HttpWebRequest自定义请求头的方式
- C# code
/// <summary>
/// Captures the text of every title element in the downloaded page.
/// Singleline lets the pattern span line breaks inside the element.
/// </summary>
private static readonly Regex TitleRegex =
    new Regex("<title>(?<title>.*?)</title>", RegexOptions.Singleline);

/// <summary>
/// Downloads <paramref name="url"/> with request headers that imitate IE 8 and
/// returns the response body decoded as gb2312.
/// </summary>
/// <param name="url">Absolute URL to request (the Host header is forced to www.baidu.com).</param>
/// <returns>
/// The decoded HTML, or a string starting with <c>err:</c> when the request fails,
/// the body cannot be decompressed, or the page title indicates that a
/// verification code (captcha) is required.
/// </returns>
public static string DownLoadHtml(string url)
{
    try
    {
        // Send a freshly generated BAIDUID cookie, the way a first-time browser visit would look.
        CookieContainer cookies = new CookieContainer();
        CookieCollection baiduCookies = new CookieCollection();
        baiduCookies.Add(new Cookie("BAIDUID", System.Guid.NewGuid().ToString("N").ToUpperInvariant() + ":FG=1"));
        cookies.Add(new Uri("http://www.baidu.com"), baiduCookies);

        HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
        request.UserAgent = @"Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET4.0C; .NET4.0E)";
        request.Accept = @"*/*";
        request.Host = "www.baidu.com";
        request.Headers["Accept-Encoding"] = "gzip, deflate";
        request.Headers["Accept-Language"] = "zh-cn";
        request.Method = "GET"; // HTTP method names are case-sensitive
        request.CookieContainer = cookies;
        request.AllowAutoRedirect = true;

        byte[] body;
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream responseStream = response.GetResponseStream())
        {
            string contentEncoding = response.Headers["Content-Encoding"] ?? string.Empty;

            if (contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                body = DecompressGzip(responseStream);
            }
            else if (contentEncoding.IndexOf("deflate", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                body = DecompressDeflate(responseStream);
            }
            else
            {
                // The server answered without compression: the body must still be read,
                // otherwise an empty page is returned.
                body = ReadStreamFully(responseStream);
            }
        }

        string data = System.Text.Encoding.GetEncoding("gb2312").GetString(body);

        // Baidu serves its captcha page with this title once it throttles the client.
        foreach (Match m in TitleRegex.Matches(data))
        {
            if (m.Groups["title"].Value.IndexOf("您的访问出错了", StringComparison.Ordinal) >= 0)
            {
                data = "err:访问出错,需填写验证码";
                break;
            }
        }

        return data;
    }
    catch (Exception er)
    {
        // Callers recognise failures by the "err:" prefix rather than by an exception.
        return "err:" + er.Message;
    }
}

/// <summary>
/// Decompresses a gzip-encoded stream into a byte array.
/// </summary>
/// <param name="streamInput">Stream positioned at the start of the gzip data; it is disposed on return.</param>
/// <returns>The fully decompressed bytes.</returns>
/// <remarks>
/// Decompression errors propagate to the caller instead of silently yielding a truncated result.
/// </remarks>
private static byte[] DecompressGzip(Stream streamInput)
{
    using (GZipStream gzip = new GZipStream(streamInput, CompressionMode.Decompress))
    {
        return ReadStreamFully(gzip);
    }
}

/// <summary>
/// Decompresses a deflate-encoded stream into a byte array.
/// </summary>
/// <param name="streamInput">Stream positioned at the start of the deflate data; it is disposed on return.</param>
/// <returns>The fully decompressed bytes.</returns>
/// <remarks>
/// Decompression errors propagate to the caller instead of silently yielding a truncated result.
/// NOTE(review): some servers send zlib-wrapped data for "deflate" while DeflateStream
/// expects raw DEFLATE - confirm against the actual responses.
/// </remarks>
private static byte[] DecompressDeflate(Stream streamInput)
{
    using (DeflateStream deflate = new DeflateStream(streamInput, CompressionMode.Decompress))
    {
        return ReadStreamFully(deflate);
    }
}

/// <summary>
/// Reads <paramref name="source"/> from its current position to the end.
/// </summary>
/// <param name="source">Readable stream; it is not disposed by this method.</param>
/// <returns>All remaining bytes of the stream.</returns>
private static byte[] ReadStreamFully(Stream source)
{
    using (MemoryStream buffer = new MemoryStream())
    {
        source.CopyTo(buffer);
        return buffer.ToArray();
    }
}