<?php
// Remove the script execution time limit (0 = no limit) before starting the crawl.
set_time_limit(0);
// Run the crawl; index() is declared further down in this file.
index();
/**
 * Entry point: crawls the 2020 division-code listing starting from its index
 * page and writes the resulting tree to area.json in the current directory.
 *
 * Failures to encode or write the result are reported on standard output
 * instead of silently leaving an empty or missing file behind.
 *
 * @return void
 */
function index()
{
    $list = getNext("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html", '', 1);

    // json_encode() returns false on failure (e.g. malformed UTF-8 in a name);
    // writing that value would produce an empty area.json without any hint why.
    $json = json_encode($list, JSON_UNESCAPED_UNICODE);
    if ($json === false) {
        echo 'json_encode failed, error code '.json_last_error()."\n";
        return;
    }

    if (file_put_contents('area.json', $json) === false) {
        echo 'failed to write area.json'."\n";
    }
}
/**
 * Recursively crawls one listing page and returns the entries found on it.
 *
 * The page URL is built from the directory part of $parent_url plus
 * $next_page (links on the site are relative to the page containing them).
 *
 * Each returned entry is an array with the keys, in this order:
 *   - 'code'      : division code, '' on level 1 where the page lists none
 *   - 'cityclass' : classification code, only filled for leaf (link-less) rows
 *   - 'name'      : display name
 *   - 'children'  : entries of the linked sub-page (same shape), [] for leaves
 *                   or when the depth limit is reached
 *
 * @param string $parent_url URL (or path) of the page that linked to $next_page
 * @param string $next_page  link target relative to the directory of $parent_url
 * @param int    $level      depth of the page being fetched, 1 for the index page
 * @param int    $max_level  deepest level that is still fetched and parsed
 * @return array list of entries as described above; [] when the page is past
 *               the depth limit, cannot be fetched/decoded, or has no rows
 */
function getNext($parent_url,$next_page,$level,$max_level=3)
{
    // No need to crawl too deep: pages past the limit never contribute to the
    // result, so bail out before spending an HTTP request on them.
    if ($level > $max_level) {
        return [];
    }

    // Small pause between requests.
    usleep(20*1000);

    // Strip the last path segment of the parent URL to get its directory.
    $slash = strrpos($parent_url, '/');
    $base = ($slash === false) ? '' : substr($parent_url, 0, $slash);
    $next_url = $base.'/'.$next_page;

    $html = fetchAreaPage($next_url);
    if ($html === false) {
        return [];
    }

    $result = [];
    foreach (parseAreaEntries($html, $level) as $entry) {
        if ($level == 1) {
            // Progress output: one line per top-level entry.
            echo $entry['name']."\n";
        }
        // Leaf rows carry no link, so there is nothing to descend into.
        $children = ($entry['href'] === null)
            ? []
            : getNext($next_url, $entry['href'], $level + 1, $max_level);
        $result[] = [
            'code' => $entry['code'],
            'cityclass' => $entry['cityclass'],
            'name' => $entry['name'],
            'children' => $children,
        ];
    }
    return $result;
}

/**
 * Downloads one page and converts it to UTF-8; retries once after 10 seconds.
 *
 * @param string $url page to fetch
 * @return string|false UTF-8 HTML, or false when the page stays empty /
 *                      unreachable after the retry or cannot be converted
 */
function fetchAreaPage($url)
{
    $html = file_get_contents($url);
    if (!$html) {
        echo '超时'."\n";
        sleep(10);
        $html = file_get_contents($url);
        if ($html) {
            echo '继续'."\n";
        }
    }
    if (!$html) {
        echo 'fetch failed: '.$url."\n";
        return false;
    }

    // Decode as GBK rather than GB2312: GBK is a superset, and names with
    // characters outside GB2312 make a strict GB2312 conversion fail.
    $utf8 = iconv('GBK', 'UTF-8', $html);
    if ($utf8 === false) {
        echo 'charset conversion failed: '.$url."\n";
        return false;
    }
    return $utf8;
}

/**
 * Extracts the rows of a listing page.
 *
 * Level 1 pages are matched as plain links (href + name); deeper pages as
 * table rows with a linked code cell and a linked name cell. When neither
 * matches, the page is treated as a leaf table of link-less
 * code / classification / name rows.
 *
 * @param string $html  UTF-8 page content
 * @param int    $level depth of the page, 1 for the index page
 * @return array list of ['href' => string|null, 'code' => string,
 *               'cityclass' => string, 'name' => string]; href is null for leaf rows
 */
function parseAreaEntries($html, $level)
{
    $r = [];
    if ($level == 1) {
        preg_match_all("/<a href='(.*?\.html)'>(.*?)</", $html, $r);
    } else {
        preg_match_all("/<td><a href='(.{1,50}\.html)'>(.*?)<\/a><\/td><td><a href='.*?'>(.*?)<\/a/", $html, $r);
    }

    $entries = [];
    if (!empty($r[0])) {
        foreach (array_keys($r[0]) as $i) {
            if ($level == 1) {
                $entries[] = ['href' => $r[1][$i], 'code' => '', 'cityclass' => '', 'name' => $r[2][$i]];
            } else {
                $entries[] = ['href' => $r[1][$i], 'code' => $r[2][$i], 'cityclass' => '', 'name' => $r[3][$i]];
            }
        }
        return $entries;
    }

    // No linked rows: fall back to the leaf table layout.
    // NOTE(review): without the `u` modifier the {1,20}/{1,30} bounds count
    // bytes, so names longer than 30 UTF-8 bytes are not matched.
    $r = [];
    preg_match_all("/<tr class='.{1,20}'><td>(.{1,20})<\/td><td>(.{1,20})<\/td><td>(.{1,30})<\/td><\/tr>/", $html, $r);
    if (!empty($r[0])) {
        foreach (array_keys($r[0]) as $i) {
            $entries[] = ['href' => null, 'code' => $r[1][$i], 'cityclass' => $r[2][$i], 'name' => $r[3][$i]];
        }
    }
    return $entries;
}
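// Usage: run this script with the PHP command-line interpreter, then simply wait for the JSON file (area.json) to be generated.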