<?php
// Crawls the 2020 code tables under stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/
// (presumably the administrative-division code listing — confirm against the
// site) and writes the resulting tree to area.json.

// The crawl issues many sequential HTTP requests, so remove the execution time limit.
set_time_limit(0);
index();

/**
 * Entry point: crawls from the 2020 index page and writes the tree to area.json
 * in the current working directory.
 *
 * @return void
 */
function index()
{
    $list = getNext("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html", '', 1);

    $json = json_encode($list, JSON_UNESCAPED_UNICODE);
    if ($json === false) {
        // Do not overwrite area.json with an empty file when encoding fails.
        echo 'json_encode failed, error code ' . json_last_error() . "\n";
        return;
    }
    file_put_contents('area.json', $json);
}

/**
 * Fetches one listing page and recursively builds the tree below it.
 *
 * The page URL is formed by replacing the last path segment of $parent_url
 * with $next_page. Each returned node has the keys 'code', 'cityclass',
 * 'name' and 'children' (in that order); 'children' holds the nodes of the
 * linked sub-page, or [] when the row has no link or the depth limit is hit.
 *
 * @param string $parent_url URL of the page that linked to this one.
 * @param string $next_page  Link target relative to the directory of $parent_url.
 * @param int    $level      1 for the index page, incremented on every descent.
 * @param int    $max_level  Deepest level whose rows are collected (default 3).
 *
 * @return array List of nodes; empty when the page could not be fetched or
 *               decoded, or when $level exceeds $max_level.
 */
function getNext($parent_url, $next_page, $level, $max_level = 3)
{
    // No need to crawl too deep: stop before spending a request on a page
    // whose rows would be thrown away.
    if ($level > $max_level) {
        return [];
    }

    // Throttle: pause 20 ms before every request.
    usleep(20 * 1000);

    // Resolve $next_page against the "directory" of $parent_url by dropping
    // the last path segment (an empty segment when the URL ends in '/').
    $segments = explode('/', $parent_url);
    array_pop($segments);
    $next_url = implode('/', $segments) . '/' . $next_page;

    $html = fetchAreaPage($next_url);
    if ($html === false) {
        // "跳过" = skipped: report the page instead of silently parsing nothing.
        echo '跳过: ' . $next_url . "\n";
        return [];
    }

    $nodes = [];
    foreach (parseAreaRows($html, $level) as $row) {
        if ($level == 1) {
            // Progress output: one line per top-level entry.
            echo $row['name'] . "\n";
        }

        $children = [];
        if ($row['href'] !== null && $level < $max_level) {
            $children = getNext($next_url, $row['href'], $level + 1, $max_level);
        }

        $nodes[] = [
            'code' => $row['code'],
            'cityclass' => $row['cityclass'],
            'name' => $row['name'],
            'children' => $children,
        ];
    }

    return $nodes;
}

/**
 * Downloads a page (retrying once after 10 seconds) and converts it to UTF-8.
 *
 * @param string $url Page URL.
 *
 * @return string|false UTF-8 HTML, or false when both attempts fail, the body
 *                      is empty, or the encoding conversion fails.
 */
function fetchAreaPage($url)
{
    $html = file_get_contents($url);
    if (!$html) {
        echo '超时' . "\n";
        sleep(10);
        $html = file_get_contents($url);
        if ($html) {
            echo '继续' . "\n";
        }
    }
    if (!$html) {
        return false;
    }

    // GBK is a superset of GB2312, so everything that decoded before still
    // decodes identically, and characters outside GB2312 no longer make
    // iconv() reject the whole page.
    return iconv('GBK', 'UTF-8', $html);
}

/**
 * Extracts the table rows of one listing page.
 *
 * Level 1 pages are matched as plain links (href, name). Deeper pages are
 * matched as rows of two linked cells (code, name). When neither matches, the
 * page is treated as a leaf table of three plain cells (code, cityclass, name)
 * whose rows have no link to follow.
 *
 * @param string $html  UTF-8 HTML of the page.
 * @param int    $level Level of the page (1 = index page).
 *
 * @return array List of rows, each with the keys 'href' (string or null),
 *               'code', 'cityclass' and 'name'.
 */
function parseAreaRows($html, $level)
{
    $is_leaf = false;

    if ($level == 1) {
        preg_match_all("/<a href='(.*?\.html)'>(.*?)</", $html, $m);
    } else {
        preg_match_all("/<td><a href='(.{1,50}\.html)'>(.*?)<\/a><\/td><td><a href='.*?'>(.*?)<\/a/", $html, $m);
    }

    if (empty($m[0])) {
        // The 'u' modifier makes the {1,30} limit count characters rather than
        // bytes; without it a UTF-8 name longer than 30 bytes is dropped.
        preg_match_all("/<tr class='.{1,20}'><td>(.{1,20})<\/td><td>(.{1,20})<\/td><td>(.{1,30})<\/td><\/tr>/u", $html, $m);
        $is_leaf = true;
    }

    $rows = [];
    if (empty($m[0])) {
        return $rows;
    }

    foreach (array_keys($m[0]) as $i) {
        if ($is_leaf) {
            $rows[] = [
                'href' => null,
                'code' => $m[1][$i],
                'cityclass' => $m[2][$i],
                'name' => $m[3][$i],
            ];
        } elseif ($level == 1) {
            // Index-page links carry no code or class; use empty strings
            // rather than indexing into '' as the original did.
            $rows[] = [
                'href' => $m[1][$i],
                'code' => '',
                'cityclass' => '',
                'name' => $m[2][$i],
            ];
        } else {
            $rows[] = [
                'href' => $m[1][$i],
                'code' => $m[2][$i],
                'cityclass' => '',
                'name' => $m[3][$i],
            ];
        }
    }

    return $rows;
}
// 用 PHP 命令行执行本脚本，然后等待 area.json 文件生成即可。