飞鸟、战神以及零距离站群这类需要直接调用txt文档的站群程序,都可以使用下文中的PHP采集规则,亲测可以使用。
测试环境:php7.0;服务器配置:4核8G10M服务器;
提示:服务器性能较差,或在 Nginx 环境下运行时,偶尔会出现 504(网关超时)错误;
测试截图
caiji.php代码
<?php
/**
 * caiji.php - news collector for site-group programs that read plain .txt files.
 *
 * For every enabled site the script
 *   1. scrapes the site's rolling-news list and appends unseen article URLs to a
 *      per-site queue file (e.g. gov.php) as "URL|0" (pending) entries;
 *   2. fetches up to $count pending articles, cleans the HTML, runs the word
 *      replacement dictionary (wei.txt) over the body and writes
 *      "title|||||||body" into data/juzi/<timestamp><n>.txt;
 *   3. marks each processed entry as "URL|1" (done).
 *
 * Written to stay compatible with PHP 7.0 (the version the script was tested on).
 */

error_reporting(E_ALL ^ E_NOTICE ^ E_WARNING);
if (ob_get_level() > 0) {
    // Only drop an output buffer when one is actually active.
    ob_end_clean();
}
ob_implicit_flush(1);
header("Content-type: text/html; charset=utf-8");
ini_set("max_execution_time", "0");
ini_set('memory_limit', '4000M');
date_default_timezone_set('Asia/Shanghai');

// Queue files, wei.txt and data/ are all addressed with relative paths, while the
// output directory is cleaned through an absolute path. Pin the working directory
// to the script directory so both always refer to the same place.
chdir(__DIR__);

/** A queue file holding more than this many entries is deleted and rebuilt. */
const QUEUE_LIMIT = 5000;
/** Read timeout (seconds) for every remote fetch, so one dead host cannot stall a run. */
const FETCH_TIMEOUT = 30;

if (!is_dir('data/juzi/')) {
    mkdir('data/juzi/', 0777, true);
}
delFile(__DIR__ . '/data/juzi/'); // empty the output folder before each run

$count = 20; // articles fetched per site per run; keep this small

/*
 * Tip: do not collect too much at once. Enable only a couple of site modules
 * and keep the others commented out (as done for NetEase below).
 */

// China News (chinanews.com)
//getURL2();
//getcontent2();

// NetEase (163.com)
//getURL();
//getcontent();

// People's Daily Online (people.com.cn)
//getURLpeople();
//getcontentpeople();

// Shenzhen Online (szonline.net)
//getURLszonline();
//getcontentszonline();

// Government portal (gov.cn)
getURLgov();
getcontentgov();

// Hualong (cqnews.net)
//getURLcqnews();
//getcontentcqnews();

// Huashang News (hsw.cn)
//getURLhsw();
//getcontenthsw();

// Shenzhen News (sznews.com)
//getURLsznews();
//getcontentsznews();

// ITHome rolling news (ithome.com)
//getURLithome();
//getcontentithome();

// Southeast Net (fjsen.com)
//getURLfjsen();
//getcontentfjsen();

// Wenzhou News (66wz.com)
//getURL66wz();
//getcontent66wz();

// Jinyang (ycwb.com)
//getURLycwb();
//getcontentycwb();

exit('本次采集完毕!');

// ---------------------------------------------------------------------------
// Site modules. Each pair keeps its original name so existing call sites and
// the toggle list above continue to work.
// ---------------------------------------------------------------------------

/** China News: queue article URLs from the rolling-news page. */
function getURL2()
{
    $html = fetchPage('http://www.chinanews.com/scroll-news/news1.html');
    preg_match_all('/class="dd_bt"><a.*?href="(.+?)">(.+?)</is', $html, $matches, PREG_SET_ORDER);

    $urls = [];
    foreach ($matches as $m) {
        // Skip video pages; strict comparison so a match at offset 0 also counts.
        if (strpos($m[1], '/shipin/') !== false) {
            continue;
        }
        // The prefix is applied BEFORE de-duplication, so the stored form
        // ("http:" + href) is what gets compared against the queue.
        $urls[] = 'http:' . $m[1];
    }
    // For this site the queue reset happens at collection time.
    queueLinks('chinanews.php', $urls, true);
}

/** China News: fetch pending articles and write them to data/juzi/. */
function getcontent2()
{
    processQueue('chinanews.php', function ($url) {
        $html = fetchPage($url);
        $title = '';
        if (preg_match('/name="newstitle".*?value=\'(.+?)\'\/>/', $html, $m)) {
            $title = strToGBK($m[1]);
        }
        // The end marker used to be identical to the start marker, which always
        // produced an empty body.
        // NOTE(review): '<!--正文end-->' is assumed to be the site's closing marker - confirm against a live page.
        return [$title, getSubstr($html, '<!--正文start-->', '<!--正文end-->')];
    }, false);
}

/** NetEase: queue article URLs from the rolling-news JSON feed. */
function getURL()
{
    $js = fetchPage('http://news.163.com/special/0001220O/news_json.js', 'GB2312');
    // The feed is a JS assignment; strip the wrapper so it parses as JSON.
    $json = json_decode(str_replace(['var data=', ';'], '', $js), true);

    $urls = [];
    if (is_array($json) && isset($json['news'][0]) && is_array($json['news'][0])) {
        foreach ($json['news'][0] as $item) {
            if (isset($item['l'])) {
                $urls[] = $item['l'];
            }
        }
    }
    queueLinks('163.php', $urls);
}

/** NetEase: fetch pending articles (pages are converted from GB2312). */
function getcontent()
{
    processQueue('163.php', function ($url) {
        return extractArticle(
            fetchPage($url, 'gb2312'),
            '<h1>',
            '</h1>',
            '/id="endText".*?>(.+?)<div class="post_btmshare">/is',
            false
        );
    });
}

/** People's Daily Online: queue article URLs from the JSON index. */
function getURLpeople()
{
    $json = json_decode(fetchPage('http://news.people.com.cn/210801/211150/index.js'), true);

    $urls = [];
    if (is_array($json) && isset($json['items']) && is_array($json['items'])) {
        foreach ($json['items'] as $item) {
            if (isset($item['url'])) {
                $urls[] = $item['url'];
            }
        }
    }
    queueLinks('people.php', $urls);
}

/** People's Daily Online: fetch pending articles (pages are converted from GB2312). */
function getcontentpeople()
{
    processQueue('people.php', function ($url) {
        return extractArticle(
            fetchPage($url, 'gb2312'),
            '<h1>',
            '</h1>',
            '/text left-->(.+?)<div class="zdfy clearfix">/is',
            false
        );
    });
}

/** Shenzhen Online: queue article URLs. */
function getURLszonline()
{
    $urls = collectLinks('http://pinpai.szonline.net/gundongxinwen/', '<ul>', '</ul>', '/ref="(.+?)"/is');
    queueLinks('szonline.php', $urls);
}

/** Shenzhen Online: fetch pending articles. */
function getcontentszonline()
{
    processQueue('szonline.php', function ($url) {
        return extractArticle(
            fetchPage($url),
            '<h2>',
            '</h2>',
            '/class="content-imfomation">(.+?)<!--分页-->/is',
            false
        );
    });
}

/** gov.cn: queue article URLs (list hrefs are site-relative, so the host is prefixed). */
function getURLgov()
{
    $urls = collectLinks(
        'http://www.gov.cn/xinwen/gundong.htm',
        '<ul>',
        '</ul>',
        '/<h4>.*?href="(.+?)"/is',
        'http://www.gov.cn'
    );
    queueLinks('gov.php', $urls);
}

/** gov.cn: fetch pending articles. */
function getcontentgov()
{
    processQueue('gov.php', function ($url) {
        return extractArticle(
            fetchPage($url),
            '<h1>',
            '</h1>',
            '/id="UCAP-CONTENT">(.+?)<div class="editor">/is',
            true
        );
    });
}

/** Hualong: queue article URLs. */
function getURLcqnews()
{
    $urls = collectLinks('http://news.cqnews.net/rollnews/', 'id="content">', 'class=\'pages\'', '/href="(.+?)"/is');
    queueLinks('cqnews.php', $urls);
}

/** Hualong: fetch pending articles. */
function getcontentcqnews()
{
    processQueue('cqnews.php', function ($url) {
        return extractArticle(
            fetchPage($url),
            '<h1 class="bigtitle">',
            '</h1>',
            '/id="_h5_content" class="main_text">(.+?)<\/article>/is',
            true
        );
    });
}

/** Huashang News: queue article URLs. */
function getURLhsw()
{
    $urls = collectLinks('http://news.hsw.cn/sx08/sxgd/', '<ul class="list">', 'class="page">', '/href="(.+?)"/is');
    queueLinks('hsw.php', $urls);
}

/** Huashang News: title comes from the article page, body from the content API. */
function getcontenthsw()
{
    processQueue('hsw.php', function ($url) {
        $title = compactTitle(getSubstr(fetchPage($url), '<h1>', '</h1>'));
        $body = '';
        // The article id embedded in the URL is what the content API expects.
        if (preg_match('/http:\/\/news.hsw.cn\/system\/.*?\/.*?\/(.+?)\.shtml/is', $url, $m)) {
            $body = fetchPage('http://tags.hsw.cn/api.php?op=allcontent&catid=367&id=' . $m[1]);
        }
        return [$title, $body];
    });
}

/** Shenzhen News: queue article URLs. */
function getURLsznews()
{
    $urls = collectLinks(
        'http://news.sznews.com/node_150127.htm',
        'class="listw mt10">',
        'class=\'pages\'>',
        '/href="(.+?)"/is'
    );
    // Duplicates inside the batch are removed by queueLinks().
    queueLinks('sznews.php', $urls);
}

/** Shenzhen News: fetch pending articles. */
function getcontentsznews()
{
    processQueue('sznews.php', function ($url) {
        return extractArticle(
            fetchPage($url),
            '<h1 class="h1-news">',
            '</h1>',
            '/<!-- Start:article -->(.+?)<!--\/enpcontent-->/is',
            true
        );
    });
}

/** ITHome: queue article URLs. */
function getURLithome()
{
    $urls = collectLinks('https://www.ithome.com/list/', 'class="ulcl">', '</ul>', '/<\/strong>.*?href="(.+?)"/is');
    queueLinks('ithome.php', $urls);
}

/** ITHome: fetch pending articles. */
function getcontentithome()
{
    processQueue('ithome.php', function ($url) {
        return extractArticle(
            fetchPage($url),
            '<h1>',
            '</h1>',
            '/id="paragraph">(.+?)<div class="con-recom adblock">/is',
            true
        );
    });
}

/** Southeast Net: queue article URLs (only links on www.fjsen.com are kept). */
function getURLfjsen()
{
    $urls = collectLinks(
        'http://news.fjsen.com/RollingNews.htm',
        'class="list_page">',
        'id="displaypagenum"',
        '/href="http:\/\/www.fjsen.com\/(.+?)"/is',
        'http://www.fjsen.com/'
    );
    queueLinks('fjsen.php', $urls);
}

/** Southeast Net: fetch pending articles. */
function getcontentfjsen()
{
    processQueue('fjsen.php', function ($url) {
        return extractArticle(
            fetchPage($url),
            '<h1>',
            '</h1>',
            '/class="zw">(.+?)<!--\/enpcontent-->/is',
            true
        );
    });
}

/** Wenzhou News: queue article URLs. */
function getURL66wz()
{
    $urls = collectLinks('http://news.66wz.com/roll/', 'class="newslist">', '</ul>', '/href="(.+?)"/is');
    queueLinks('66wz.php', $urls);
}

/** Wenzhou News: fetch pending articles (pages are converted from GB2312). */
function getcontent66wz()
{
    processQueue('66wz.php', function ($url) {
        return extractArticle(
            fetchPage($url, 'gb2312'),
            'id="artibodytitle">',
            '</h1>',
            '/id="artibody">(.+?)<!--enorth/is',
            true
        );
    });
}

/** Jinyang: queue article URLs. */
function getURLycwb()
{
    $urls = collectLinks('http://news.ycwb.com/n_gd_jd.htm', 'class="lists-box"', '</ul>', '/href="(.+?)"/is');
    queueLinks('ycwb.php', $urls);
}

/** Jinyang: fetch pending articles (the title is taken from the <title> tag). */
function getcontentycwb()
{
    processQueue('ycwb.php', function ($url) {
        return extractArticle(
            fetchPage($url),
            '<title>',
            '</title>',
            '/<!-- 正文 :: START -->(.+?)<!-- 正文 :: ENDED -->/is',
            true
        );
    });
}

// ---------------------------------------------------------------------------
// Shared pipeline helpers
// ---------------------------------------------------------------------------

/**
 * Download a URL and optionally convert it to UTF-8.
 *
 * @param string $url         Address to fetch.
 * @param string $fromCharset Source charset to convert from; '' leaves the bytes untouched.
 * @return string Page content, or '' when the download or the conversion fails.
 */
function fetchPage($url, $fromCharset = '')
{
    static $context = null;
    if ($context === null) {
        $context = stream_context_create(['http' => ['timeout' => FETCH_TIMEOUT]]);
    }

    $raw = file_get_contents($url, false, $context);
    if ($raw === false) {
        return '';
    }
    if ($fromCharset === '') {
        return $raw;
    }
    $converted = iconv($fromCharset, 'utf-8//IGNORE', $raw);
    return $converted === false ? '' : $converted;
}

/**
 * Scrape a list page: cut the section between two markers and collect the
 * first capture group of $pattern from it.
 *
 * @param string $listUrl List page address.
 * @param string $left    Marker that precedes the link list.
 * @param string $right   Marker that follows the link list.
 * @param string $pattern PCRE pattern whose group 1 is the link.
 * @param string $prefix  Text prepended to every captured link.
 * @return array List of URLs (may contain duplicates).
 */
function collectLinks($listUrl, $left, $right, $pattern, $prefix = '')
{
    $section = getSubstr(fetchPage($listUrl), $left, $right);
    if (!preg_match_all($pattern, $section, $found)) {
        return [];
    }
    $urls = [];
    foreach ($found[1] as $href) {
        $urls[] = $prefix . $href;
    }
    return $urls;
}

/**
 * Pull the title and the raw body HTML out of an article page.
 *
 * @param string $html         Article page (UTF-8).
 * @param string $titleLeft    Marker before the title.
 * @param string $titleRight   Marker after the title.
 * @param string $bodyPattern  PCRE pattern whose group 1 is the body HTML.
 * @param bool   $compactTitle Whether line breaks and spaces are removed from the title.
 * @return array [title, rawBodyHtml]; either may be ''.
 */
function extractArticle($html, $titleLeft, $titleRight, $bodyPattern, $compactTitle)
{
    $rawTitle = getSubstr($html, $titleLeft, $titleRight);
    $title = $compactTitle ? compactTitle($rawTitle) : strToGBK($rawTitle);
    $body = preg_match($bodyPattern, $html, $m) ? $m[1] : '';
    return [$title, $body];
}

/** Remove line breaks and spaces from a title and make sure it is valid UTF-8. */
function compactTitle($title)
{
    return strToGBK(str_replace(["\r\n", "\n", " "], "", $title));
}

/**
 * Turn raw body HTML into the text that is stored: scripts, styles and HTML
 * comments are dropped, only <p>/<br>/<img> tags survive, line breaks are
 * removed and the replacement dictionary is applied once.
 */
function cleanBody($html)
{
    $html = (string) preg_replace('/<script[^>]*?>.*?<\/script>/is', "", (string) $html);
    $html = (string) preg_replace('/<style[^>]*?>.*?<\/style>/is', "", $html);
    $html = (string) preg_replace('/<!--.*?-->/is', "", $html);
    $text = strToGBK(strip_tags($html, '<p>,<br>,<img>'));
    return strtr_words(str_replace(["\r\n", "\n"], "", trim($text)));
}

/** Load a queue file; a missing or malformed file yields an empty queue. */
function loadLinks($file)
{
    if (!is_file($file)) {
        return [];
    }
    $links = include $file;
    return is_array($links) ? $links : [];
}

/**
 * Split a queue entry "URL|status" at the LAST pipe, so a pipe inside the URL
 * cannot be mistaken for the status separator.
 *
 * @return array [url, status]; status is '' when the entry has no separator.
 */
function splitLink($entry)
{
    $entry = (string) $entry;
    $pos = strrpos($entry, '|');
    if ($pos === false) {
        return [$entry, ''];
    }
    return [(string) substr($entry, 0, $pos), (string) substr($entry, $pos + 1)];
}

/**
 * Persist a queue. When $resetWhenFull is set and the queue has outgrown
 * QUEUE_LIMIT, the file is deleted instead so the queue starts over.
 */
function storeLinks($file, array $links, $resetWhenFull)
{
    if ($resetWhenFull && count($links) > QUEUE_LIMIT) {
        if (is_file($file)) {
            unlink($file);
        }
        return;
    }
    setinc($links, $file);
}

/**
 * Append unseen URLs to a queue file as pending ("URL|0") entries.
 *
 * @param string $file          Queue file name.
 * @param array  $urls          Candidate URLs.
 * @param bool   $resetWhenFull Apply the QUEUE_LIMIT reset when saving.
 */
function queueLinks($file, array $urls, $resetWhenFull = false)
{
    $links = loadLinks($file);

    // Index known URLs once instead of scanning the queue twice per candidate.
    $known = [];
    foreach ($links as $entry) {
        list($seenUrl) = splitLink($entry);
        $known[$seenUrl] = true;
    }

    foreach ($urls as $url) {
        $url = (string) $url;
        if ($url === '' || isset($known[$url])) {
            continue;
        }
        $known[$url] = true;
        $links[] = $url . '|0';
    }

    storeLinks($file, $links, $resetWhenFull);
}

/**
 * Work through the pending entries of a queue: fetch, clean, save, mark done.
 *
 * At most $count (global) entries are processed per call. An entry is marked
 * done even when nothing could be saved, so a broken article is not retried
 * on every run.
 *
 * @param string   $file          Queue file name.
 * @param callable $extract       function (string $url): array [title, rawBodyHtml].
 * @param bool     $resetWhenFull Apply the QUEUE_LIMIT reset when saving.
 */
function processQueue($file, callable $extract, $resetWhenFull = true)
{
    global $count;

    $links = loadLinks($file);
    if (empty($links)) {
        return;
    }

    $done = 0;
    foreach ($links as $key => $entry) {
        // Strictly "fewer than $count": the old "<=" fetched one article too many.
        if ($done >= $count) {
            break;
        }
        list($url, $status) = splitLink($entry);
        if ($status !== '0') {
            continue;
        }

        list($title, $rawBody) = $extract($url);
        $title = (string) $title;
        $body = cleanBody($rawBody);

        // Bodies of 200 characters or fewer are treated as failed extractions.
        if ($title !== '' && mb_strlen($body, 'UTF-8') > 200) {
            file_put_contents('data/juzi/' . msectime() . $done . '.txt', $title . '|||||||' . $body);
        }

        $links[$key] = $url . '|1';
        $done++;
    }

    storeLinks($file, $links, $resetWhenFull);
}

// ---------------------------------------------------------------------------
// Low-level utilities (original names and signatures)
// ---------------------------------------------------------------------------

/**
 * Current Unix time in milliseconds.
 *
 * @return float Whole number of milliseconds, as a float.
 */
function msectime()
{
    return (float) sprintf('%.0f', microtime(true) * 1000);
}

/**
 * Return the text between the first $leftStr and the next $rightStr.
 *
 * @param string $str      Haystack.
 * @param string $leftStr  Opening marker (not included in the result).
 * @param string $rightStr Closing marker (not included in the result).
 * @return string The enclosed text, or '' when either marker is missing.
 */
function getSubstr($str, $leftStr, $rightStr)
{
    $str = (string) $str;

    $start = mb_strpos($str, $leftStr);
    if ($start === false) {
        // Without this guard false+length was used as an offset and returned
        // unrelated text.
        return '';
    }
    $rest = mb_substr($str, $start + mb_strlen($leftStr));

    $end = mb_strpos($rest, $rightStr);
    if ($end === false) {
        return '';
    }
    return mb_substr($rest, 0, $end);
}

/**
 * Write an array to $file as an includable PHP file ("<?php return [...];").
 *
 * Values are serialised with var_export(), so quotes or backslashes in a
 * scraped URL cannot break the file or inject code when it is included.
 *
 * @param array  $data Data to store.
 * @param string $file Target file name.
 */
function setinc($data, $file)
{
    $code = "<?php\nreturn " . var_export($data, true) . ";\n";
    file_put_contents($file, $code, LOCK_EX);

    // The file is included again later in the same request; make sure an
    // opcode cache does not serve the previous version.
    if (function_exists('opcache_invalidate')) {
        $real = realpath($file);
        if ($real !== false) {
            opcache_invalidate($real, true);
        }
    }
}

/**
 * Return $strText as valid UTF-8.
 *
 * Despite the historical name, no GBK conversion happens: valid UTF-8 is
 * returned unchanged, otherwise invalid byte sequences are dropped.
 *
 * @param string $strText Input text.
 * @return string Valid UTF-8 text.
 */
function strToGBK($strText)
{
    $strText = (string) $strText;
    if (mb_check_encoding($strText, 'UTF-8')) {
        return $strText;
    }
    $clean = iconv('UTF-8', 'UTF-8//IGNORE', $strText);
    if ($clean === false) {
        // Some iconv builds fail on invalid input even with //IGNORE.
        $clean = mb_convert_encoding($strText, 'UTF-8', 'UTF-8');
    }
    return (string) $clean;
}

/**
 * Apply the word-replacement dictionary from wei.txt to $str.
 *
 * wei.txt holds one "search,replacement" pair per line. The first non-empty
 * line is skipped (presumably a header row - confirm against the dictionary
 * file). When a search word appears more than once, the first pair wins.
 * Lines without a comma are ignored. The dictionary is parsed once per request.
 *
 * @param string $str Text to rewrite.
 * @return string Rewritten text; $str unchanged when the dictionary is missing or empty.
 */
function strtr_words($str)
{
    static $words = null;

    if ($words === null) {
        $words = [];
        $content = is_file('wei.txt') ? file_get_contents('wei.txt') : false;
        if ($content !== false) {
            $lines = preg_split('/\n/', str_replace("\r", "", $content), -1, PREG_SPLIT_NO_EMPTY);
            foreach ($lines as $index => $line) {
                if ($index == 0) {
                    continue;
                }
                $pair = explode(',', $line); // comma separates search word and replacement
                if (count($pair) < 2 || $pair[0] === '' || isset($words[$pair[0]])) {
                    continue;
                }
                $words[$pair[0]] = $pair[1];
            }
        }
    }

    $str = (string) $str;
    // strtr() with an empty map returns false before PHP 8; return the input instead.
    if (empty($words)) {
        return $str;
    }
    return strtr($str, $words);
}

/**
 * Delete every file below $dirName, descending into sub-directories.
 * The directories themselves are left in place.
 *
 * @param string $dirName Directory to empty.
 */
function delFile($dirName)
{
    if (!is_dir($dirName)) {
        return;
    }
    $handle = opendir($dirName);
    if ($handle === false) {
        return;
    }

    $base = rtrim($dirName, '/');
    while (false !== ($item = readdir($handle))) {
        if ($item === '.' || $item === '..') {
            continue;
        }
        $path = $base . '/' . $item;
        // Never follow a symlinked directory: only the link itself is removed.
        if (is_dir($path) && !is_link($path)) {
            delFile($path);
        } else {
            unlink($path);
        }
    }
    closedir($handle);
}
本站所有内容仅限用于学习和研究目的,程序仅供本地断网测试,转载请说明出处!
站群工具网 » 可用的新闻采集caiji.php规则
站群工具网 » 可用的新闻采集caiji.php规则