定义一个http抓包类,发送数据到一个自定义的接受脚本,可以发送成功,并收取数据,但是发送到外网,却不行,分析过在浏览器下发送HTTP请求时的request header 信息,通过模拟请求,但超时...
//定义一个HTTP抓包类,其实也可以用curl。。。。。
<?php
ini_set('error_reporting', E_ALL);
class Httpwrap
{
private $hostInfo=null;
private $requestLine=null;
private $requestHeader=null;
private $emptyLine="\r\n";
private $requestBody=null;
private $requestEntity=null;
private $responseEntity=null;
private $responseHeader=null;
private $responseBody=null;
private $emptyLinePos=null;
private $connect=null;
private $errNo=null;
private $errStr=null;
public function __construct($url)
{
$this->hostInfo=parse_url($url);
$this->setRequestHeader(array('Host' => $this->hostInfo['host']));
$this->setRequestHeader(array('Connection' => 'keep-alive'));
}
//设置HTTP请求行信息,例如: GET /resources HTTP/1.1
//但为了避免漏掉url中?开始的查询信息,有必要进行判断
public function setRequestLine($method)
{
//如果是POST请求,则自动添加content-type头信息
if(strtolower($method)=='post')
{
$this->setRequestHeader(array('Content-type' => 'application/x-www-form-urlencoded'));
}
if(!empty($this->hostInfo['query']))
{
$this->requestLine=strtoupper($method)." ".$this->hostInfo['path']."?".$this->hostInfo['query']." HTTP/1.1 \r\n";
}
else
{
$this->requestLine=strtoupper($method)." ".$this->hostInfo['path']." HTTP/1.1 \r\n";
}
}
//设置HTTP请求头。
//接收参数是数组类型,通过迭代拼接key:value,并换行
public function setRequestHeader($header)
{
foreach($header as $key => $value)
{
$this->requestHeader .=$key.":".$value."\r\n";
}
}
//设置HTTP请求体
//接收参数是数组类型,通过迭代拼接key=value,因为最后一席拼接会有一个多余的&,所以有必要去掉
public function setRequestBody($body)
{
foreach($body as $key => $value)
{
$this->requestBody .=$key.'='.$value.'&';
}
$offset=strrpos($this->requestBody, '&');
$this->requestBody=substr($this->requestBody, 0, $offset);
}
//组装 请求行+请求头+请求体,并根据请求体的长度,自动填充请求头的content-length字段
public function setRequestEntity()
{
if(!empty($this->requestBody))
{
$contentLength=strlen($this->requestBody);
$this->setRequestHeader(array('Content-length' => $contentLength));
$this->requestEntity=$this->requestLine.$this->requestHeader."\r\n".$this->requestBody;
}
else
{
$this->requestEntity=$this->requestLine.$this->requestHeader."\r\n";
}
}
//解析主机名的函数,暂时没有用上.......
public function parseHost($url)
{
$pat='#http://([^/]+)#i';
if(preg_match($pat, $url, $match))
{
return $match[1];
}
else
{
echo '匹配主机信息失败<br />';
}
}
//创建到主机的连接
public function createConnect()
{
$this->connect=fsockopen($this->hostInfo['host'], 80, $this->errNo, $this->errStr) or die('连接主机失败'.$this->errStr);
}
//发送请求
public function sendRequest()
{
$this->setRequestEntity();
echo $this->requestEntity;
exit();
$this->createConnect();
$entityLength=strlen($this->requestEntity);
if($entityLength != fwrite($this->connect, $this->requestEntity, $entityLength))
{
die('写入数据失败<br />');
}
else
{
$this->receiveResponse();
}
}
//接受请求,并依次拼接响应体
public function receiveResponse()
{
while(!feof($this->connect))
{
$this->responseEntity .= fread($this->connect, 1024);
}
}
//计算响应头与响应体之间的空行的位置
public function calculateEmptyLinePos()
{
$this->emptyLinePos=strpos($this->responseEntity,"\r\n\r\n",0);
}
//接受响应体的头部....
public function receiveResponseHeader()
{
$this->calculateEmptyLinePos();
$this->responseHeader=substr($this->responseEntity, 0, $this->emptyLinePos);
echo $this->responseHeader;
}
//接收响应体的body部分
public function receiveResponseBody()
{
$this->calculateEmptyLinePos();
$this->responseBody=substr($this->responseEntity, $this->emptyLinePos);
}
//返回请求结果
public function getResponse()
{
return $this->responseEntity;
}
public function parseResponse()
{}
public function __destruct()
{
//fclose($this->connect);
}
}
set_time_limit(60);
$http=new Httpwrap("http://www.mmkao.com/Beautyleg/");
//设置HTTP请求行
$http->setRequestLine("get");
//设置HTTP头
$http->setRequestHeader(array("Accept" => "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"));
$http->setRequestHeader(array("Accept-Language" => "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3"));
$http->setRequestHeader(array("Accept-Encoding" => "gzip, deflate"));
$http->setRequestHeader(array("User-Agent" => "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.101 Safari/537.36"));
//$http->setRequestHeader(array("Cookie" => "BAIDU_DUP_lcr=http://www.baidu.com/s?wd=beautyleg&rsv_spt=1&issp=1&f=3&rsv_bp=0&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_enter=1&rsv_sug3=6&rsv_sug4=415&rsv_sug1=3&oq=beauty&rsv_sug2=0&rsp=0&inputT=2363; safedog-flow-item=8471BA510DA33350ED344AC374D3044A; bdshare_firstime=1415165097782; cscpvrich_fidx=6; AJSTAT_ok_pages=2; AJSTAT_ok_times=2; CNZZDATA3811623=cnzz_eid%3D253823549-1415164312-http%253A%252F%252Fwww.baidu.com%252F%26ntime%3D1415169712"));
//发送数据
$http->sendRequest();
//$http->receiveResponseHeader();
?>
通过这个类给领一个自定义的脚本,可以发送和接收数据,领一个脚本如下:
<?php
if(!empty($_POST))
{
$str=implode(',',$_POST);
file_put_contents('./post.txt', $str,FILE_APPEND);
echo $str;
}
?>
但是给这个网站发送请求时,却超时:网站是:
http://www.mmkao.com/Beautyleg/
通过chrome给这个网站首页发送请求时的header头信息:
Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
Accept-Encoding:gzip,deflate,sdch
Accept-Language:zh,en;q=0.8,zh-TW;q=0.6,zh-CN;q=0.4,ja;q=0.2
Cache-Control:max-age=0
Connection:keep-alive
Cookie:BAIDU_DUP_lcr=http://www.baidu.com/s?wd=beautyleg&rsv_spt=1&issp=1&f=3&rsv_bp=0&rsv_idx=2&ie=utf-8&tn=baiduhome_pg&rsv_enter=1&rsv_sug3=6&rsv_sug4=415&rsv_sug1=3&oq=beauty&rsv_sug2=0&rsp=0&inputT=2363; safedog-flow-item=8471BA510DA33350ED344AC374D3044A; bdshare_firstime=1415165097782; cscpvrich_fidx=7; AJSTAT_ok_pages=3; AJSTAT_ok_times=2; CNZZDATA3811623=cnzz_eid%3D253823549-1415164312-http%253A%252F%252Fwww.baidu.com%252F%26ntime%3D1415169712
DNT:1
Host:www.mmkao.com
User-Agent:Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.101 Safari/537.36
Response Headersview source
//通过相同的包装,并调用Httpwrap发送请求时,却提示超时,是在不知道哪里出问题........
针对这个网站写了一个过滤出图片链接的类:
<?php
class Parseimage
{
private $responseBody=null;
private $imgLink=null;
private $pageNum=null;
private header=null;
private body=null;
public function __construct($body)
{
$this->responseBody=$body;
}
//匹配图片src开始的链接地址
public function feedImage()
{
$pat='#<img(.*?)src="(.*?)(?=")#i';
if(preg_match_all($pat, $body, $match))
{
foreach($match[2] as $link)
{
$this->imgLink[]=$link;
}
}
else
{
echo '匹配失败图片链接地址失败'."<br />";
}
}
//提取head部分
public function filterHeader($body)
{
$pat='#<head>[\s\S]+</head>#im';
if(preg_match($pat, $body, $match))
{
$this->header=$match[0];
}
else
{
echo '匹配head部分失败'."<br />";
}
}
//提取body部分
public function filterBody($body)
{
$pat='#<body>[\s\S]+</body>#im';
if(preg_match($pat, $body, $match))
{
$this->body=$match[0];
}
else
{
echo '匹配body部分失败'."<br />";
}
}
//提取分页信息,这个只能针对性的匹配,不能通用
public function rollPage($body)
{
$pat='#[\x{4e00}-\x{9fa5}]+\s*\d\s+?/\s+?\d+\s*[\x{4e00}-\x{9fa5}]*#ui';
if(preg_match($pat, $body, $match))
{
$patNum='#/\s*(\d\d*)#';
if(preg_match($patNum, $match[0], $num))
{
$this->pageNum=$num[1];
}
else
{
echo '提取分页具体值失败'."<br />";
}
}
else
{
echo '提取分页统计失败'."<br />";
}
}
?>
附注: 这两个类,,都通过了内网的测试,并成功过滤出图片的链接,但是给http://www.mmkao.com/Beautyleg/发送请求时,却提示超时,,不知道哪里出了问题。。。。。。
------解决思路----------------------
$url = 'http://www.mmkao.com/Beautyleg/';
print_r(get_headers($url));
Array
(
[0] => HTTP/1.1 200 OK
[1] => Connection: close
[2] => Date: Wed, 05 Nov 2014 08:53:09 GMT
[3] => Content-Length: 13889
[4] => Content-Type: text/html
[5] => Content-Location: http://www.mmkao.com/Beautyleg/index.html
[6] => Last-Modified: Wed, 05 Nov 2014 05:39:09 GMT
[7] => Accept-Ranges: bytes
[8] => ETag: "e8939ad2baf8cf1:693"
[9] => Server: IIS
[10] => X-Powered-By: WAF/2.0
[11] => Set-Cookie: safedog-flow-item=8471BA510DA33350ED344AC374D3044A; expires=Sat, 12-Dec-2150 10:26:25 GMT; domain=mmkao.com; path=/
)