抓取也就是采集。今天因为我们要做一个实时采集新浪新闻的功能,下面整理了一个非常实用的 PHP 抓取新浪新闻的程序,我们来看看。首先,需要下载第三方扩展库 simple_html_dom,具体下载方式和使用详情可以查看《simple_html_dom的使用》;另外,运行环境需要支持 file_get_contents() 函数和 curl 扩展。具体代码如下: include_once('simple_html_dom.php'); $ch = curl_init(); curl_setopt ($ch, CURLOPT_URL, $url); curl_setopt($ch, CURLOPT_HEADER, false); curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1); $output = curl_exec($ch); curl_close($ch); $html = new simple_html_dom(); $html->load($output); $images = array(); $arr = array(); foreach ($html->find('li a') as $element) { if ( preg_match ('#^http://tech.sina.com.cn/it/[\d]{4}-[\d]{1,2}-[\d]{1,2}/[\d]+.shtml$#i', $element->href)) { array_push($images, $element->href); } } $images = array_unique($images); sort($images); for ($i = 0; $i 【注:原文此处有一段代码在提取时丢失,包括 for 循环的其余部分以及下面这个函数的开头】 load($data); $arr = array(); foreach ($html->find('h1#artibodyTitle') as $element) { $arr['title'] = @ iconv ('gbk', 'utf-8', $element->innertext);; } $str = ''; foreach ($html->find('div#artibody p') as $element) { $str.= $element; } $arr['content'] = $str; foreach ($html->find('div.img_wrapper img') as $element) { $arr['alt'] = $element->alt; $data = file_get_contents($element->src); $info = getimagesize($element->src); //get image information switch ($info[2]) { case 1: $str = 'gif'; break; case 2: $str = 'jpg'; break; case 3: $str = 'png'; break; default: continue; break; } $filename = time() . rand(1, 999999) . '.' . $str; if (!is_dir($dirname)) { mkdir($dirname, 0777, true); } $fp = fopen ($dirname . $filename, 'w'); fwrite($fp, $data); fclose($fp); $arr['img'] = $dirname . $filename; } return $arr;} 登录后复制 永久链接:转载随意,但请带上文章地址。
09-14 11:33