【笔记】PHP联网抓取城市4级联动数据
本帖最后由 奋斗丶小Z 于 2016-3-21 22:51 编辑
环境:wamp
编辑器:sublime
抓取地址:http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2014/index.html
拿了之前代码看下,个人比较懒,都是改原来的代码,大概这个意思
首先,配置apache:编辑httpd-vhosts.conf,新建一个虚拟主机,绑定域名为1.com(测试用),然后重启apache服务。再编辑本机的hosts文件,添加一行 127.0.0.1 1.com
基本工作完成,开始敲一些代码
顺手写了一个类,这个不是必须的,类的结构没做优化,随心所欲的写吧
<?php
/**
 * Scraper for the administrative-division code pages published on
 * stats.gov.cn (province -> city -> county -> town -> village).
 *
 * Pages are downloaded with file_get_contents() and converted from GB2312 to
 * UTF-8 before being parsed with regular expressions.
 *
 * The link-returning methods (getProvincetr, citytr, getcountytr, gettowntr)
 * all return a zero-indexed list whose entries are
 * array(0 => href, 1 => name). The href is copied verbatim from the page
 * markup; resolving it against the page URL is left to the caller.
 */
class getCity
{
    /** URL of the page loaded by getContent(). */
    private $url = null;

    /** UTF-8 text of the page loaded by getContent(); null until then. */
    private $content = null;

    /**
     * @param string $url Page that getContent() will download.
     */
    public function __construct($url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2014/index.html")
    {
        $this->url = $url;
    }

    /**
     * Download the current URL and keep its UTF-8 text for getProvincetr()
     * and showContent(). Stores an empty string when the download fails.
     */
    public function getContent()
    {
        $this->content = $this->fetchPage($this->url);
    }

    /**
     * Change the URL used by the next getContent() call.
     *
     * @param string $url
     */
    public function setUrl($url)
    {
        $this->url = $url;
    }

    /**
     * Echo the text loaded by getContent().
     */
    public function showContent()
    {
        echo $this->content;
    }

    /**
     * Extract the province links from the page loaded by getContent().
     *
     * A province row holds several links, one per province, and each name is
     * followed by a <br/> that is not part of the returned name.
     *
     * @return array List of array(href, name); empty when nothing matches.
     */
    public function getProvincetr()
    {
        return $this->extractLinks((string) $this->content, 'provincetr', true);
    }

    /**
     * Tell whether the page at $url contains a linked "市辖区" cell.
     *
     * @param string $url
     * @return bool
     */
    public function judge($url)
    {
        // Compare against false explicitly: strpos() returns an offset, and
        // offset 0 must not be mistaken for "not found".
        return strpos($this->fetchPage($url), '市辖区</a></td>') !== false;
    }

    /**
     * Extract the city-level rows (class 'citytr') of the page at $url.
     *
     * @param string $url
     * @return array List of array(href, name); empty when nothing matches.
     */
    public function citytr($url)
    {
        return $this->extractLinks($this->fetchPage($url), 'citytr');
    }

    /**
     * Extract the county-level rows (class 'countytr') of the page at $url.
     *
     * Rows whose cells carry no link are skipped, because only linked cells
     * are matched.
     *
     * @param string $url
     * @return array List of array(href, name); empty when nothing matches.
     */
    public function getcountytr($url)
    {
        return $this->extractLinks($this->fetchPage($url), 'countytr');
    }

    /**
     * Extract the town/street-level rows (class 'towntr') of the page at $url.
     *
     * @param string $url
     * @return array List of array(href, name); empty when nothing matches.
     */
    public function gettowntr($url)
    {
        return $this->extractLinks($this->fetchPage($url), 'towntr');
    }

    /**
     * Extract the village-level rows (class 'villagetr') of the page at $url.
     *
     * @param string $url
     * @return array|int 0 when the download fails or the raw response is
     *                   shorter than 50 bytes; otherwise the preg_match_all()
     *                   pattern-order result: index 0 holds the matched rows,
     *                   index 1 the text of the third cell of each row.
     */
    public function getVillagetr($url)
    {
        $raw = file_get_contents($url);
        if ($raw === false || strlen($raw) < 50) {
            return 0;
        }

        $html = mb_convert_encoding($raw, 'utf-8', 'gb2312');
        // Match the rows directly on the page text. Imploding the nested
        // match array first would only produce the string "ArrayArray".
        preg_match_all(
            "/<tr class='villagetr'><td>.*?<\/td><td>.*?<\/td><td>(.*?)<\/td><\/tr>/",
            $html,
            $arrVill
        );

        return $arrVill;
    }

    /**
     * Download $url and convert it from GB2312 to UTF-8.
     *
     * @param string $url
     * @return string Page text, or '' when the download fails.
     */
    private function fetchPage($url)
    {
        $raw = file_get_contents($url);
        if ($raw === false) {
            return '';
        }

        return mb_convert_encoding($raw, 'utf-8', 'gb2312');
    }

    /**
     * Collect the linked cells of every <tr class='$rowClass'> row in $html.
     *
     * @param string $html      UTF-8 page text.
     * @param string $rowClass  Value of the row's class attribute.
     * @param bool   $everyLink true: return every link of a row;
     *                          false: return only the last link of a row
     *                          (in a two-link row that is the second one).
     * @return array List of array(href, name).
     */
    private function extractLinks($html, $rowClass, $everyLink = false)
    {
        $rowPattern = "~<tr class='" . preg_quote($rowClass, '~') . "'>(.*?)</tr>~";
        if (!preg_match_all($rowPattern, $html, $rows)) {
            return array();
        }

        $entries = array();
        foreach ($rows[1] as $rowHtml) {
            // The optional <br/> keeps the trailing break out of the name.
            $found = preg_match_all(
                "~<td><a href='(.*?)'>(.*?)(?:<br/>)?</a></td>~",
                $rowHtml,
                $links,
                PREG_SET_ORDER
            );
            if (!$found) {
                continue;
            }
            if (!$everyLink) {
                $links = array(end($links));
            }
            foreach ($links as $link) {
                $entries[] = array($link[1], $link[2]);
            }
        }

        return $entries;
    }
}
?>
为了方便,我一次抓取一级然后写入数据库,下一次从数据库读取上一级,并且开始抓取下一级写入数据库
数据库字段包括:id,name,url,fid,path,level
因为后期对数据库可能有更多需求,所以这里path、level、fid三个字段全都写上了;实际上只要path,或者只要level和fid,就可以实现无限分类
代码很简单,就是一层一层的抓取
举个例子
<?php
require "getCity.class.php";
require "common.func.php";

// Root of the crawl; the province hrefs are resolved against it.
$baseUrl = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2014/';

// Resolve a relative href against the directory of the page it was found on,
// e.g. ('.../2014/11/1101.html', '01/110101.html') => '.../2014/11/01/110101.html'.
$resolve = function ($pageUrl, $href) {
    $cut = strrpos($pageUrl, '/');
    $dir = ($cut === false) ? '' : substr($pageUrl, 0, $cut + 1);
    return $dir . $href;
};

$city = new getCity();
$city->getContent();
$arr = $city->getProvincetr();

// This script reads every entry as array(0 => relative href, 1 => name) and
// hangs the next level under the 'next' key of its parent entry.
foreach ($arr as &$province) {
    // Province page: lists the cities.
    $provinceUrl = $resolve($baseUrl, $province[0]);
    $province['next'] = $city->citytr($provinceUrl);

    foreach ($province['next'] as &$cityEntry) {
        // City page: lists the counties.
        $cityUrl = $resolve($provinceUrl, $cityEntry[0]);
        $cityEntry['next'] = $city->getcountytr($cityUrl);

        foreach ($cityEntry['next'] as &$county) {
            // County page: lists the towns/streets.
            $countyUrl = $resolve($cityUrl, $county[0]);
            $county['next'] = $city->gettowntr($countyUrl);

            foreach ($county['next'] as &$town) {
                // Town page: lists the villages/offices.
                $townUrl = $resolve($countyUrl, $town[0]);
                $town['next'] = $city->getVillagetr($townUrl);
            }
            // Release the reference so a later assignment cannot overwrite
            // the last element by accident.
            unset($town);
        }
        unset($county);

        // One short progress line per city instead of dumping the whole
        // subtree, which floods the browser.
        echo $cityEntry[1], "<br>\n";
    }
    unset($cityEntry);
}
unset($province);

// Serialise the whole tree, not just the last province visited.
$endstr = json_encode($arr);
if ($endstr === false || file_put_contents('1.txt', $endstr) === false) {
    trigger_error('Could not encode the result or write 1.txt', E_USER_WARNING);
} else {
    var_dump("输出完成");
}
?>
这样,代码执行完毕,所有数据都抓取出来了(但是,网速原因,部分页面会打不开,找个地方记录没有打开的网址就可以了,我这里有十几个的样子)
可以自己写代码逐层抓取,这里写个抓取市级的,并且写入数据库(上一级信息从数据库查询)
执行过程中输出的信息还是越少越好,最好使用ajax动态从数据库查询进度,否则浏览器容易崩溃
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Document</title>
<script src="http://cdn.bootcss.com/jquery/1.11.3/jquery.min.js"></script>
</head>
<body>
<div id="aaa"></div>
<script>
// Entry count parsed from the most recent response: null before the first
// response, NaN when the response was not numeric.
var prenum = null;
// HTML fragment of the status line shown under the count.
var msg = null;

/**
 * Poll 2.php once and render the number of entries written so far into #aaa.
 *
 * A numeric response shows the "waiting" status line. A non-numeric response
 * or a failed request shows the "delayed" status line.
 *
 * NOTE(review): the original assigned prenum from data and then compared the
 * two, which is only false for non-numeric responses. That observable
 * behaviour is kept and made explicit with isNaN(). Confirm whether a
 * comparison with the previous poll was intended instead.
 */
function getdata() {
    var waitingMsg = "<div style='color:red;'>正在等待数据库更新条目。。。</div>";
    var delayedMsg = "<div style='color:red;'>数据库更新延迟,有可能网卡了。。。</div>";

    $.get("2.php", function (data) {
        // Explicit radix so the count is always read as decimal.
        prenum = parseInt(data, 10);
        msg = isNaN(prenum) ? delayedMsg : waitingMsg;
        $("#aaa").html("已经写入数据库条目:" + data + "<br>" + msg);
    }).fail(function () {
        // Without this handler a failed request left the previous text in
        // place, so the "delayed" line never appeared on a real failure.
        var lastCount = (prenum === null || isNaN(prenum)) ? "" : prenum;
        msg = delayedMsg;
        $("#aaa").html("已经写入数据库条目:" + lastCount + "<br>" + msg);
    });
}
</script>
<script>
// Poll once per second. Pass the function itself: a string argument is
// compiled and evaluated on every tick, like eval().
setInterval(getdata, 1000);
</script>
</body>
</html>
最后抓取后查询到的数据信息
这个到底有什么用? 没明白是干什么用的~! 没明白,看的晕 学习学习呢 前排座椅 你这干嘛用的? 我是菜鸟,弱弱的问一句,这个软件有什么用啊 弱弱的问一句 这有啥用 大神。。。
顿时觉得自己是个渣渣,
刚好要用到,拿来参考
感谢分享,吾爱因你更精彩
页:
[1]
2