当前位置: 代码迷 >> PHP >> share 一段小说的采集代码
  详细解决方案

share 一段小说的采集代码

热度:246   发布时间:2016-04-28 23:32:14.0
share 一段小说的采集代码。
最近用火车头、ET 采集小说,按它们的规则经常配不出来。碰到像小说520 那样用 iframe 的站点就直接挂掉,只好自己写了一个。刚开始觉得就是用两个正则解决[列表、内容]的事,写着写着便复杂起来了。
好好地改了几版,碰到的最大麻烦是如何封装代码,也就是让采集不同站点时的改动成本较小。这里小小地用了一个决策者模式,然后把该封装的功能封装起来;另外加了再次采集时对已采集章节的跳过机制,避免重采[毕竟一个小说站有好几万篇文章,中断一次接不上去,是很郁闷的事]。

PHP code
/**
 * Crawler controller that scrapes novel chapter lists from two source sites
 * (xiaoxiaoshuo.net and xiaoshuo520.com) and hands every chapter link to
 * check_dstory() for storage.
 *
 * Each story page is split into chapter groups ("categories"); a category row
 * is looked up or created via check_dcate(), and categories already flagged as
 * finished are skipped so an interrupted crawl can be resumed.
 *
 * NOTE(review): check_dstory() and log() are called below but are defined
 * outside the visible source.
 */
class grep extends Controller {

    var $tableName = 'grep';
    var $pagesize = 31;
    var $order_string = "grep_order desc,grep_id desc";
    var $filter_field = "grep_title";
    var $check_repeat_field = "grep_title";
    var $buttons = array(
    );
    var $description = "[爬取小说]";

    /**
     * Crawl the chapter lists of stories with story_id < 445 from
     * xiaoxiaoshuo.net.
     *
     * For each story the page is fetched, converted from GBK to UTF-8,
     * stripped of noisy markup, split into chapter groups, and every group
     * that is not already finished is passed to add_storyxxs_info().
     */
    function index() {
        // Get the story list.
        $story_model = "story_model";
        $this->load->model($story_model);
        $where = array("story_id < 445");
        $rows_story = $this->$story_model->get($where);

        foreach ($rows_story as $key => $val_story) {
            // NOTE(review): hard-coded offset; looks like a manual "resume
            // from here" marker left over from an interrupted run - confirm.
            if ($key < 237) {
                continue;
            }

            $url = "http://www.xiaoxiaoshuo.net/".$val_story->storycate_vtitle."/".$val_story->story_vtitle;
            $src_content = file_get_contents($url);
            if ($src_content === false) {
                // Fetch failed: skip this story instead of feeding `false`
                // into iconv() and the regexes below.
                $this->log("<font color = red>无法读取 $url </font>,跳过");
                continue;
            }
            $src_content = iconv("GBK","utf-8//IGNORE",$src_content);

            // Strip the inline style attributes of the chapter cells. The
            // first pattern is a regex, so it must go through preg_replace();
            // str_replace() would look for the pattern text literally and
            // never match.
            $src_content = preg_replace("/style=\"border-width:0px\s*1px\s*1px\s*0px;border-color:#C8D8B8;border-style:solid;padding:3px;float:left;width:313px;\"/i","",$src_content);
            $src_content = str_replace("style=\"BORDER-RIGHT: #c8d8b8 1px solid; PADDING-RIGHT: 3px; BORDER-TOP: #c8d8b8 0px solid; PADDING-LEFT: 3px; FLOAT: left; PADDING-BOTTOM: 3px; BORDER-LEFT: #c8d8b8 0px solid; WIDTH: 313px; PADDING-TOP: 3px; BORDER-BOTTOM: #c8d8b8 1px solid\"","",$src_content);

            // Drop title attributes and <li> wrappers, and normalise every
            // anchor so that it starts with exactly "<a href".
            $src_content = preg_replace("/title=\"[^\"]*\"/iU","",$src_content);
            $src_content = preg_replace("/<LI[^>]*>/iU","",$src_content);
            $src_content = preg_replace("/<\/LI[^>]*>/iU","",$src_content);
            $src_content = preg_replace("/<a(?!href)[\d\D]*href/iU","<a href",$src_content);

            // Collapse whitespace runs, then turn remaining line breaks and
            // tabs into plain spaces.
            $src_content = preg_replace('/\s(?=\s)/', '', $src_content);
            $src_content = preg_replace('/[\n\r\t]/', ' ', $src_content);

            // NOTE(review): this prefix is hard-coded for one specific story;
            // presumably it should be the current story's URL - confirm.
            $src_content = str_replace("http://www.xiaoxiaoshuo.net/yanqingxiaoshuo2/tijiaxinniang/","",$src_content);

            // One match per chapter group: from the group header cell up to
            // the end of its <ul>.
            preg_match_all("/<td\s*bgcolor=\"#EDF5EA\"([\d\D]*)<\/ul>/iU",$src_content,$arr_dstorycate);
            $dstorycate_arr = $arr_dstorycate[1];

            foreach ($dstorycate_arr as $val_dstory_cate) {
                preg_match_all("/<font\s*color=\"#000000\">([^<]*)<\/font>/i",$val_dstory_cate,$dcate_title);
                // Fall back to an empty title when the group has no header.
                $dtitle = isset($dcate_title[1][0]) ? $dcate_title[1][0] : '';

                // Fetch the category object; check_dcate() also flags the
                // earlier categories of this story as downloaded.
                $obj_storycate = $this->check_dcate($dtitle,$val_story);
                if ($obj_storycate->dstorycate_ishot == 1) {
                    $this->log( "<font color = gray>已此章节已抓取完 $val_story->story_title - $dtitle </font>,跳过");
                    continue;
                }

                // Group 1 = chapter href, group 2 = chapter link text.
                preg_match_all("/<a\s*href=\"([^\"]*)\"[^>]*>(?!<\/a>)([\d\D]*)<\/a>/iU",$val_dstory_cate,$dinfo_list);
                $list_story_url   = $dinfo_list[1];
                $list_story_title = $dinfo_list[2];

                // The base URL for the chapter links is the story URL itself.
                $this->add_storyxxs_info($obj_storycate,$list_story_url,$list_story_title,$url);
            }
        }
    }

    /**
     * Echo crawl progress counters: total dstory rows, dstory rows with
     * dstory_status = 1, and the highest dstorycate id / pid, separated
     * by "--".
     */
    function status() {
        $sql = "select count(dstory_id) as all_story from dstory;";
        $query = $this->db->query($sql);
        $cont_all = $query->row();
        echo $cont_all->all_story;

        $sql = "select count(dstory_id) as story1 from dstory where dstory_status = 1";
        $query = $this->db->query($sql);
        $cont_all = $query->row();
        echo "--".$cont_all->story1;

        $sql = "select max(dstorycate_id) as max_id,max(dstorycate_pid) as max_pid from dstorycate";
        $query = $this->db->query($sql);
        $cont_all = $query->row();
        echo "--".$cont_all->max_id."--".$cont_all->max_pid;
    }

    /**
     * Crawl the chapter lists of stories with story_id > 445 from
     * xiaoshuo520.com.
     *
     * NOTE(review): index() handles story_id < 445 and this method
     * story_id > 445, so story 445 itself is covered by neither - confirm
     * that this is intended.
     */
    function index445() {
        $story_model = "story_model";
        $this->load->model($story_model);
        $where = array("story_id > 445");
        $rows_story = $this->$story_model->get($where);

        foreach ($rows_story as $key => $val_story) {
            // Build the site-relative story path from segments 1..4 of the
            // stored story_url.
            $story_url_arr = explode("/",$val_story->story_url);
            $story_url = $story_url_arr[1]."/".$story_url_arr[2]."/".$story_url_arr[3]."/".$story_url_arr[4];
            $dest_url = "http://www.xiaoshuo520.com/".$story_url;

            // CS_file_get_contents() is a project helper defined elsewhere.
            $src_content = CS_file_get_contents($dest_url);
            $src_content = iconv("GBK","utf-8//IGNORE",$src_content);

            // Group the page by category: each match runs from a
            // NclassTitle div up to the next ListEnd div.
            preg_match_all("/(<div\s*id=\"NclassTitle\">[\d\D]*)<div\s*id=\"ListEnd/iU",$src_content,$arr_dstorycate);
            $dstorycate_arr = $arr_dstorycate[1];

            foreach ($dstorycate_arr as $val_dstory_cate) {
                // NOTE(review): unlike the other patterns this one has no U
                // flag, so the capture is greedy and runs to the LAST </div>
                // in the group - confirm this is intended.
                preg_match_all("/<div\s*id=\"NclassTitle\">([\d\D]*)<\/div>/i",$val_dstory_cate,$dcate_title);
                // Fall back to an empty title when the group has no header.
                $dtitle = isset($dcate_title[1][0]) ? $dcate_title[1][0] : '';

                // Fetch the category object; check_dcate() also flags the
                // earlier categories of this story as downloaded.
                $obj_storycate = $this->check_dcate($dtitle,$val_story);
                if ($obj_storycate->dstorycate_ishot == 1) {
                    $this->log( "<font color = gray>已此章节已抓取完 $val_story->story_title - $dtitle </font>,跳过");
                    continue;
                }

                // Group 1 = chapter href, group 2 = chapter link text.
                preg_match_all("/<a\s*href=\"([^\"]*)\"[^>]*>(?!<\/a>)([\d\D]*)<\/a>/iU",$val_dstory_cate,$dinfo_list);
                $list_story_url   = $dinfo_list[1];
                $list_story_title = $dinfo_list[2];

                $this->add_story520_info($obj_storycate,$list_story_url,$list_story_title,$story_url);
            }
        }
    }

    /**
     * Look up (or create) the category row for a chapter-group title of a
     * story, and mark all earlier categories of that story as published.
     *
     * @param string $title     Chapter-group title scraped from the page.
     * @param object $obj_story Story row; story_id and story_title are read.
     * @return object The dstorycate row for ($obj_story->story_id, $title).
     */
    function check_dcate($title, $obj_story) {
        $dstorycate_model = "dstorycate_model";
        $this->load->model($dstorycate_model);

        // $title comes from scraped HTML, so it must be escaped before it is
        // embedded in a SQL condition; escape() also adds the quotes.
        $story_id  = intval($obj_story->story_id);
        $title_sql = $this->db->escape((string) $title);

        $where = array("dstorycate_pid = $story_id","dstorycate_title = $title_sql");
        $rows = $this->$dstorycate_model->get($where);

        if (!$rows) {
            // No such category yet: insert it and read the new row back.
            $datacate = array();
            $datacate["dstorycate_pid"]   = $obj_story->story_id;
            $datacate["dstorycate_title"] = $title;

            $this->$dstorycate_model->insert($datacate);
            $obj_cate_id = intval($this->db->insert_id());
            $where = array("dstorycate_id = $obj_cate_id","dstorycate_title = $title_sql");
            $rows = $this->$dstorycate_model->get($where);
            $this->log( "此书没有相关类别,将进行添加 小说$obj_story->story_title - $title ");
        } else {
            $this->log( "已存在相关小说类别 $obj_story->story_title - $title ,跳过");
        }
        $obj_cate = $rows[0];

        // Every category of this story with a smaller id than the current
        // one is flagged as published.
        $cate_id = intval($obj_cate->dstorycate_id);
        $sql = "update dstorycate set dstorycate_published = 1 where dstorycate_pid = $story_id AND dstorycate_id < $cate_id ";
        $this->db->query($sql);

        return $obj_cate;
    }

    /**
     * Hand the chapters of one xiaoshuo520.com category to check_dstory().
     *
     * The first intval($cate_obj->dstorycate_pvcount) entries of the list
     * are skipped silently.
     *
     * @param object $cate_obj         Category row from check_dcate().
     * @param array  $list_story_url   Chapter hrefs, relative to $url.
     * @param array  $list_story_title Chapter titles, same keys as the hrefs.
     * @param string $url              Site-relative story path.
     */
    function add_story520_info($cate_obj,$list_story_url,$list_story_title,$url) {
        $dstory_model = "dstory_model";
        $this->load->model($dstory_model);

        // intval() already yields 0 for empty / non-numeric values.
        $min_key = intval($cate_obj->dstorycate_pvcount);

        foreach ($list_story_url as $key => $val) {
            if ($key < $min_key) {
                continue;
            }
            $this->check_dstory($cate_obj,"http://www.xiaoshuo520.com/".$url."/".$val,$list_story_title[$key],"grep_520_info");
        }
    }

    /**
     * Hand the chapters of one xiaoxiaoshuo.net category to check_dstory().
     *
     * The first intval($cate_obj->dstorycate_pvcount) entries of the list
     * are skipped, and each skip is logged.
     *
     * @param object $cate_obj         Category row from check_dcate().
     * @param array  $list_story_url   Chapter hrefs, relative to $url.
     * @param array  $list_story_title Chapter titles, same keys as the hrefs.
     * @param string $url              Absolute story URL.
     */
    function add_storyxxs_info($cate_obj,$list_story_url,$list_story_title,$url) {
        $dstory_model = "dstory_model";
        $this->load->model($dstory_model);

        // intval() already yields 0 for empty / non-numeric values.
        $min_key = intval($cate_obj->dstorycate_pvcount);

        foreach ($list_story_url as $key => $val) {
            if ($key < $min_key) {
                $this->log("$cate_obj->dstorycate_id 号 $cate_obj->dstorycate_title ".$list_story_title[$key]." \n章 $key < $min_key ");
                continue;
            }
            $this->check_dstory($cate_obj,$url."/".$val,$list_story_title[$key],"grep_xxs_info");
        }
    }

    // NOTE(review): the class body is not closed within the visible source;
    // check_dstory() and log(), which the methods above call, are expected to
    // follow here.
  相关解决方案