跳转到主要内容
月亮不孤主 提交于 11 August 2012

版本: DRUPAL 7

要采集的页面:http://drupalgarden.cn/forum/1202.html

 

代码:~

 

<?php

/**
 * Creates and saves a Drupal 7 node, then prints the new node's nid.
 *
 * @param string $title Node title.
 * @param int    $uid   Uid stored as the node author.
 * @param string $body  Markup stored as the value of the body field.
 * @param string $type  Node type machine name (e.g. 'article').
 */
function create_node($title, $uid, $body, $type) {
  // Create the object explicitly: assigning properties to an undefined
  // variable is a fatal error on PHP 8 and a warning on PHP 7.
  $node = new stdClass();
  $node->is_new = 1;
  $node->title = $title;
  $node->uid = $uid;
  $node->type = $type;
  $node->body['und'][0]['value'] = $body;
  node_save($node);
  print $node->nid;
}

/**
 * Downloads a URL with cURL while presenting a Baidu spider user agent.
 *
 * The Referer header is set to the URL's directory part followed by '/'.
 *
 * @param string $url URL to fetch.
 * @param bool   $img When TRUE, return the reported download size instead of
 *                    the response body.
 *
 * @return string|float|false
 *   The response body (FALSE if curl_exec() failed), or, when $img is TRUE,
 *   the content length reported by cURL converted to kilobytes and rounded
 *   to two decimals.
 */
function _get_contents($url, $img = FALSE) {
  $dir = pathinfo($url);
  $refer = $dir['dirname'] . '/';
  // Pose as the Baidu spider.
  $user_agent = "Baiduspider+(+http://www.baidu.com/search/spider.htm)";

  $ch = curl_init();
  curl_setopt($ch, CURLOPT_URL, $url);
  curl_setopt($ch, CURLOPT_HEADER, 0);
  curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
  curl_setopt($ch, CURLOPT_REFERER, $refer);
  curl_setopt($ch, CURLOPT_TIMEOUT, 800);
  curl_setopt($ch, CURLOPT_USERAGENT, $user_agent);
  $file_contents = curl_exec($ch);

  if ($img == TRUE) {
    $content_length = curl_getinfo($ch, CURLINFO_CONTENT_LENGTH_DOWNLOAD);
    // Release the handle on this path too; it was previously leaked.
    curl_close($ch);
    return round($content_length / 1024, 2);
  }

  curl_close($ch);
  return $file_contents;
}

$url = "http://drupalgarden.cn/forum/1202.html";
$content = _get_contents($url, FALSE);

$a = array();
$t = array();
if ($content !== FALSE) {
  preg_match_all('/div class="forum-post-content">(.*?)<\/div>/is', $content, $a);
  // NOTE(review): the opening tag of this pattern was lost in the published
  // listing; `<h2[^>]*>` is a reconstruction - confirm against the markup of
  // the page being scraped.
  preg_match_all('/<h2[^>]*>(.*?)<\/h2>/is', $content, $t);
}

// Only create a node when the download succeeded and both patterns matched,
// instead of reading missing array offsets and saving an empty node.
if (isset($a[1][0]) && isset($t[1][0])) {
  $body = $a[1][0];
  $title = $t[1][0];
  $type = 'article';
  $uid = 1;
  create_node($title, $uid, $body, $type);
}
else {
  print 'Nothing imported: download failed or the page markup did not match.';
}
?>

你可以继续根据 nid 写个 foreach 或者 for 循环,在 5 分钟内把龙马的论坛全部采集过来,龙马可别揍我 :)

标签