<?php
/**
 * Crawls the 2020 administrative-division code pages published on
 * www.stats.gov.cn and stores the resulting tree in area.json.
 */

// The crawl issues many sequential HTTP requests, so lift the execution time limit.
set_time_limit(0);

index();

/**
 * Entry point: crawl from the 2020 index page and write the tree to area.json
 * (relative to the current working directory).
 *
 * @return void
 */
function index()
{
    $list = getNext("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2020/index.html", '', 1);

    $json = json_encode($list, JSON_UNESCAPED_UNICODE);
    if ($json === false) {
        // Do not overwrite area.json with an empty file when encoding fails.
        echo 'json_encode failed, error code ' . json_last_error() . "\n";
        return;
    }

    if (file_put_contents('area.json', $json) === false) {
        echo 'could not write area.json' . "\n";
    }
}

/**
 * Fetch one page and return the entries found on it, recursing into linked
 * child pages until $max_level is reached.
 *
 * @param string $parent_url URL of the page that linked to $next_page; its last
 *                           path segment is replaced by $next_page.
 * @param string $next_page  Relative href of the page to fetch ('' fetches the
 *                           directory of $parent_url itself).
 * @param int    $level      Depth of the page being fetched, starting at 1.
 * @param int    $max_level  Deepest level to return entries for (default 3, no
 *                           need to crawl too deep). Pages beyond it are not
 *                           fetched at all.
 *
 * @return array List of ['code' => string, 'cityclass' => string,
 *               'name' => string, 'children' => array]; empty when the page is
 *               beyond $max_level, could not be fetched/decoded, or had no
 *               matching rows.
 */
function getNext($parent_url, $next_page, $level, $max_level = 3)
{
    // Beyond the depth limit nothing is returned, so skip the request entirely
    // (previously the page was downloaded and parsed, then thrown away).
    if ($level > $max_level) {
        return [];
    }

    usleep(20 * 1000); // 20 ms pause between requests

    // Swap the last path segment of the parent URL for the requested page.
    $segments = explode('/', $parent_url);
    array_pop($segments);
    $next_url = implode('/', $segments) . '/' . $next_page;

    // One retry after a 10 s pause when the first request returns nothing.
    $html = file_get_contents($next_url);
    if (!$html) {
        echo '超时' . "\n";
        sleep(10);
        $html = file_get_contents($next_url);
        if ($html) {
            echo '继续' . "\n";
        }
    }
    if (!$html) {
        echo 'fetch failed: ' . $next_url . "\n";
        return [];
    }

    // GB2312 is tried first, as before. iconv() returns false on byte sequences
    // outside the source charset, which would otherwise drop the whole page, so
    // fall back to the GBK and GB18030 supersets.
    // NOTE(review): assumes the site serves GB-family encodings -- confirm.
    $decoded = false;
    foreach (['GB2312', 'GBK', 'GB18030'] as $charset) {
        $decoded = @iconv($charset, 'UTF-8', $html);
        if ($decoded !== false) {
            break;
        }
    }
    if ($decoded === false) {
        echo 'decode failed: ' . $next_url . "\n";
        return [];
    }
    $html = $decoded;

    // Level 1: plain links (href, name). Deeper levels: two linked cells per
    // row (href + code, then name).
    if ($level == 1) {
        preg_match_all("/<a href='(.*?\.html)'>(.*?)</", $html, $r);
    } else {
        preg_match_all("/<td><a href='(.{1,50}\.html)'>(.*?)<\/a><\/td><td><a href='.*?'>(.*?)<\/a/", $html, $r);
    }

    // No linked rows: treat the page as a leaf table of three plain cells
    // (code, class, name).
    $is_leaf = false;
    if (count($r[0]) == 0) {
        preg_match_all("/<tr class='.{1,20}'><td>(.{1,20})<\/td><td>(.{1,20})<\/td><td>(.{1,30})<\/td><\/tr>/", $html, $r);
        $is_leaf = true;
    }

    // Normalise the capture groups into parallel columns.
    if ($is_leaf) {
        $hrefs   = [];
        $names   = $r[3];
        $codes   = $r[1];
        $classes = $r[2];
    } elseif ($level == 1) {
        $hrefs   = $r[1];
        $names   = $r[2];
        $codes   = [];
        $classes = [];
    } else {
        $hrefs   = $r[1];
        $names   = $r[3];
        $codes   = $r[2];
        $classes = [];
    }

    $result = [];
    // Iterate over names (not hrefs) so that leaf rows, which have no link,
    // are emitted too.
    foreach ($names as $i => $name) {
        if ($level == 1) {
            echo $name . "\n"; // progress output, one line per top-level entry
        }

        $children = [];
        if (isset($hrefs[$i]) && $level < $max_level) {
            $children = getNext($next_url, $hrefs[$i], $level + 1, $max_level);
        }

        $result[] = [
            'code'      => isset($codes[$i]) ? $codes[$i] : '',
            'cityclass' => isset($classes[$i]) ? $classes[$i] : '',
            'name'      => $name,
            'children'  => $children,
        ];
    }

    return $result;
}
用 PHP 命令行执行该脚本,然后等待它在当前目录生成 area.json 文件即可。