php采集器分页版

PHP　2011/6/13 18:06:24　　点击：不统计

原载于：www.forasp.cn 网站制作学习，转载必究

php采集器带分页功能，本站原创，解释已经写好，正则部分是从数据库中读出来的。

<?php
// Use the full opening tag: the short form "<?" only works when the
// short_open_tag ini setting is enabled.
// Allow the script to run for at most 60 seconds.
set_time_limit(60);
/**
 * Resolve a link found on a page against the URL of that page.
 *
 * - Absolute http/https links are returned unchanged.
 * - Protocol-relative links ("//host/path") get the scheme of the page.
 * - Root-relative links ("/path") are attached to the scheme and host of the page.
 * - Any other link is treated as relative to the directory of the page.
 *
 * @param string $gcurl URL of the page the link was found on
 * @param string $url   link as it appears in the page
 * @return string the resolved absolute URL
 */
function formate_url($gcurl,$url){
    $url = (string)$url;
    // Already absolute (http or https, any letter case): nothing to resolve.
    if(preg_match('#^https?://#i',$url))return $url;

    // "http://host/dir/file" splits into array("http:", "", "host", "dir", "file").
    $url_a = explode("/",$gcurl);
    // Keep https when the page itself was served over https; default to http otherwise.
    $scheme = (strtolower($url_a[0])=="https:") ? "https:" : "http:";
    $host = isset($url_a[2]) ? $url_a[2] : "";

    // Protocol-relative link: only the scheme is missing.
    if(substr($url,0,2)=="//")return $scheme.$url;

    // Root-relative link: attach to scheme and host.
    if(substr($url,0,1)=="/")return $scheme."//".$host.$url;

    // The page URL has no path at all ("http://host"), so the last segment is
    // the host rather than a file name and must not be dropped.
    if(count($url_a)<=3)return $gcurl."/".$url;

    // Drop the file name of the page and append the link to its directory.
    array_pop($url_a);
    return implode("/",$url_a)."/".$url;
}
/**
 * Prepare a stored rule fragment for use inside a "/"-delimited regex.
 *
 * Forward slashes are prefixed with a backslash, the result is passed
 * through addslashes(), and every pair of consecutive backslashes is then
 * collapsed back into a single one.
 *
 * @param string $str rule fragment
 * @return string the processed fragment
 */
function de_($str){
    // Step 1 and 2: escape the delimiter, then add slashes.
    $slashed = addslashes(strtr($str, array("/" => "\\/")));
    // Step 3: splitting on a double backslash and re-joining with a single
    // one collapses each pair, scanning left to right.
    return implode("\\", explode("\\\\", $slashed));
}
// Remember when this request started; the collection loop further down
// compares time() against this value to decide when to reload the page.
$btime = time();
?><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=gb2312" />
<title>采集器-php采集器带分页功能采集器</title>
</head>
<script language="javascript" src="../js/jquery-1.4.4.min.js"></script>
<script language="javascript">
$(function(){

});
/**
 * Refresh the progress read-out on the page.
 *
 * Writes the status sentence into #info, the percentage text into
 * #baifenbi and the percentage as a CSS width onto #yanse.
 *
 * @param num  total number of articles to collect
 * @param ge   position of the article currently being collected
 * @param left number of articles remaining
 */
function change_state(num,ge,left){
    $("#info").html("总共需要采集"+num+"篇，当前正在采集第"+ge+"个,剩余<font color='#ff0000'>"+left+"</font>");
    // Declared with var so the value does not leak into the global scope.
    var percent = parseInt(ge / num * 100, 10);
    // A total of 0 (or a non-numeric argument) yields NaN; show 0% instead.
    if(isNaN(percent)){
        percent = 0;
    }
    $("#baifenbi").text(percent + "%");
    $("#yanse").css("width", percent + "%");
}
</script>
<body style="font-size:12px;">
<table width="100%">
<tr><td><div id="load">初始化中请等待...</div></td></tr>
</table>
<table width="100%">
<tr>
<td width="48%" height="23">

</td>
<td width="52%"><div id="baifenbi"></div></td>
</tr>
</table>
<table width="100%">
<tr><td>
<div style=" width:100%; height:25;" id="info"></div>
</td></tr>
</table>
<?php
// Initialisation: load the collection rule selected by ?gcid= and decide
// which list page this request should fetch.
// NOTE(review): htmldecode(), err_back_message() and $db are not defined in
// this part of the file; presumably they come from an include - confirm.
$gcid = htmldecode(isset($_GET["gcid"]) ? $_GET["gcid"] : "");
$beginid = htmldecode(isset($_GET["beginid"]) ? $_GET["beginid"] : "");
// beginid is the index of the first link to process on this request. It is
// used as a loop counter later, so force it to a non-negative integer.
$beginid = max(0, (int)$beginid);
if(empty($gcid)||!is_numeric($gcid))err_back_message("参数错误，缺少操作对象!");
// The id is concatenated into SQL below, so reduce it to a plain integer.
$gcid = (int)$gcid;

$sql ="select title,cfrom,code,gcurl,gc_b,gc_e,gcu_b,gcu_e,c_b,c_e,other,addtime from gc_rule where delflag = 3 and gcid=".$gcid;
$db->execute($sql);
$num = $db->num_rows();
if($num>0){
    $rows = $db->fetch_array();
    $title = htmlspecialchars_decode($rows["title"]);          // rule title
    $cfrom = htmlspecialchars_decode($rows["cfrom"]);          // source name
    $code = htmlspecialchars_decode($rows["code"]);            // page encoding: utf-8 or gbk
    $gcurl = htmlspecialchars_decode($rows["gcurl"]);          // URL of the first list page
    $gcu_b = de_(htmlspecialchars_decode($rows["gcu_b"]));     // pagination area: start pattern
    $gcu_e = de_(htmlspecialchars_decode($rows["gcu_e"]));     // pagination area: end pattern; the "next page" link is expected to be the last link inside
    $gc_b = de_(htmlspecialchars_decode($rows["gc_b"]));       // article link list: start pattern
    $gc_e = de_(htmlspecialchars_decode($rows["gc_e"]));       // article link list: end pattern
    $c_b = de_(htmlspecialchars_decode($rows["c_b"]));         // article content: start pattern
    $c_e = de_(htmlspecialchars_decode($rows["c_e"]));         // article content: end pattern
    $other = htmlspecialchars_decode($rows["other"]);          // remarks
    $db->free_result();
}else{
    $db->free_result();
    err_back_message("参数错误，不存在规则!");
}

// nextgcurl is empty on the first request (start from the rule's own URL);
// otherwise it names the list page currently being collected.
// PHP has already URL-decoded $_GET, so the value must not be decoded again.
$nextgcurl = isset($_GET["nextgcurl"]) ? $_GET["nextgcurl"] : "";
if($nextgcurl!=""){
    // The value comes from the request and is handed to file_get_contents(),
    // so accept http/https URLs only (no local paths or other wrappers).
    if(!preg_match('#^https?://#i',$nextgcurl))exit("参数错误，采集地址无效!");
    $gcurl = $nextgcurl;
}

// Fetch the list page; "@" keeps the fetch warning out of the HTML output,
// failure is handled by the check right below.
$maincontent = @file_get_contents($gcurl);
if(!$maincontent)exit("不支持采集功能!");
if($code=='utf-8'){
    $maincontent = iconv("utf-8","gbk",$maincontent);
    // iconv() returns false when the page cannot be converted.
    if($maincontent===false)exit("不支持采集功能!");
}

// Generic anchor pattern: group 2 is the href value, group 4 the link text.
$a_gc_rule = "/<a(.*?)href=[\"|'](.*?)[\"|'](.*?)>(.*?)<\/a>/i";

// Work out the URL of the next list page: cut out the pagination area and
// take the last link inside it.
$next_url = "";
if(!empty($gcu_b)&&!empty($gcu_e)){
    $gcu_rule = "/".$gcu_b."([\s\S]*)".$gcu_e."/";
    // Only resolve a URL when both the pagination area and a link inside it
    // were actually found; otherwise there is no next page.
    if(preg_match($gcu_rule,$maincontent,$next_array)
        && preg_match_all($a_gc_rule,$next_array[1],$next_url_array)){
        $next_hrefs = $next_url_array[2];
        $next_url = formate_url($gcurl,$next_hrefs[count($next_hrefs)-1]);
    }
}

?>
<script language="javascript">

</script>
<?php
flush();
// Build the list of article links for this page: cut the link-list area out
// of the page, then collect every anchor inside it.
$gc_rule = "/".$gc_b."([\s\S]*)".$gc_e."/";
$a_content = "";
if(preg_match($gc_rule,$maincontent,$list_match)){
    $a_content = $list_match[1];
}

$link_array = array();   // absolute article URLs, in page order
$title_array = array();  // link text for the URL at the same index
if($a_content!="" && preg_match_all($a_gc_rule,$a_content,$anchors)){
    foreach($anchors[2] as $idx=>$href){
        // Resolve first, then de-duplicate: $link_array holds resolved URLs,
        // so comparing the raw href against it would miss relative duplicates.
        $absolute = formate_url($gcurl,$href);
        if(!in_array($absolute,$link_array,true)){
            $link_array[] = $absolute;
            $title_array[] = $anchors[4][$idx];
        }
    }
}
$link_num = count($link_array);
// Collect every article link on this page, starting at $beginid (non-zero
// when the page was reloaded part-way through to avoid the time limit).
for($i=$beginid;$i<$link_num;$i++){
   echo "开始采集:".$link_array[$i];

   // The URL and the title come from the scraped page, so they must be
   // escaped before being concatenated into SQL.
   // NOTE(review): addslashes() matches what this script already does for the
   // content; if the $db layer offers its own escaping or bound parameters,
   // prefer that (addslashes is not safe for every connection charset).
   $sql_url = addslashes($link_array[$i]);
   $sql = "select oldurl from gc where oldurl = '".$sql_url."' order by gcrid desc";
   $num = $db->get_rows($sql);
if($num<=0){
?>
<script language="javascript">

</script>
<?php
   $title = $title_array[$i]; // link text is used as the article title
   $content = "";
   // "@" keeps the fetch warning out of the HTML output; a failed fetch is
   // reported as a failed article below.
   $content_all = @file_get_contents($link_array[$i]);
   if($content_all!==false && $code=='utf-8')$content_all = iconv("utf-8","gbk",$content_all);
   if($content_all!==false && $content_all!=""){
      $content_rule = "/".$c_b."([\s\S]*)".$c_e."/"; // pattern for the article body
      if(preg_match($content_rule,$content_all,$content_a))$content = $content_a[1];
   }
   if($content!=""){
      // Flags and charset are pinned: with the defaults of PHP >= 5.4 (UTF-8)
      // htmlspecialchars() returns an empty string for GBK bytes. A
      // single-byte charset leaves all bytes >= 0x80 untouched.
      $sql_content = htmlspecialchars(addslashes($content),ENT_COMPAT,'ISO-8859-1');
      $sql = "insert into gc(gcrid,title,content,newstime,oldurl)values(".$gcid.",'".addslashes($title)."','".$sql_content."',".date("U").",'".$sql_url."')";
      // Best effort: a failed insert is shown as a failed article.
      $num = @$db->insert($sql);
   }else{
      $num=0;
   }
   flush();
   echo "<br>已经采集:".$title."   ".($num>0?"<font color=green>成功</font>!":"<font color=red>失败</font>!");
   echo "<br>地址：".$link_array[$i]."<br><hr style='height:1px'>";
}else{
   // Already stored earlier: skip it.
   echo "<br><font color=green>采集过</font>,<font color=red>放弃!</font></a>";
   echo "<br>地址：".$link_array[$i]."<br><hr style='height:1px'>";
}
if((time()-$btime)>20){
   // More than 20 seconds spent on this request: reload the same list page
   // and continue with the next link, before the time limit is reached.
   $db->close();
?>
<script language="javascript">
<!--
$("#load").html("需要分页采集，请等待跳转...");
window.location="?gcid="+<?php echo $gcid;?>+"&beginid="+<?php echo $i+1;?>+"&nextgcurl=<?php echo urlencode($gcurl);?>";
-->
</script>
<?php
   exit;
}
}

$db->close();

// The current list page is finished. The condition below holds when a next
// page URL was found and it differs from the page just collected.
// NOTE(review): the script block emitted here is empty, so no jump to
// $next_url takes place; the redirect code appears to be missing from this
// copy of the file - confirm against the original source.
if(!empty($next_url)&&$next_url!=$gcurl){
?>
<script language="javascript">

</script>
<?php
}
?>
<br>

<网站学习制作 forasp.cn>

·上一篇：PHP分页代码 >> 　　　·下一篇：PHP获取关键词代码 >>