Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

更新 #8

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion areas.js

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion cities.js

Large diffs are not rendered by default.

72 changes: 72 additions & 0 deletions extra.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
module.exports = {
"710000": {
"710100000000": "香港",
"710200000000": "九龙",
"710300000000": "新界"
},
"710100000000": {
"710101000000": "中西区",
"710102000000": "东区",
"710103000000": "湾仔区",
"710104000000": "南区",
},
"710200000000": {
"710203000000": "九龙城区",
"710204000000": "观塘区",
"710205000000": "深水埗区",
"710206000000": "黄大仙区",
"710207000000": "油尖旺区",
},
"710300000000": {
"710301000000": "离岛区",
"710302000000": "葵青区",
"710303000000": "北区",
"710304000000": "西贡区",
"710305000000": "沙田区",
"710306000000": "屯门区",
"710307000000": "大埔区",
"710308000000": "荃湾区",
"710309000000": "元朗区",
},
"720000": {
"720100000000": "澳门半岛",
"720200000000": "离岛",
},
"720100000000": {
"720101000000": "大堂区",
"720102000000": "风顺堂区",
"720103000000": "花地玛堂区",
"720104000000": "花王堂区",
"720105000000": "望德堂区",
},
"720200000000": {
"720201000000": "嘉模堂区",
"720202000000": "路氹填海区",
"720203000000": "圣方济各堂区",
},
"730000": {
"730100000000": "台湾省"
},
"730100000000": {
"730101000000": "台北市",
"730102000000": "新北市",
"730103000000": "桃园市",
"730104000000": "台中市",
"730105000000": "台南市",
"730106000000": "高雄市",
"730107000000": "基隆市",
"730108000000": "新竹市",
"730109000000": "嘉义市",
"730110000000": "新竹县",
"730111000000": "苗栗县",
"730112000000": "彰化县",
"730113000000": "南投县",
"730114000000": "云林县",
"730115000000": "嘉义县",
"730116000000": "屏东县",
"730117000000": "宜兰县",
"730118000000": "花莲县",
"730119000000": "台东县",
"730120000000": "澎湖县",
},
}
186 changes: 83 additions & 103 deletions format.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,10 @@ const { timeout, writeFileSync } = require('./utils');
const provinces = require('./provinces');
const cities = require('./cities');
const areas = require('./areas');
const extra = require('./extra');
const pcodes = Object.keys(provinces['86']);

/**
/**
* 四个直辖市会将「市辖区」作为二级行政区域
* 重庆市会将「县」作为二级行政区域
* 河北省/河南省/湖北省/海南省 等省份会将「省直辖县级行政区划」作为第二级行政区域
Expand All @@ -23,67 +24,51 @@ const pcodes = Object.keys(provinces['86']);
const filter = ['市辖区', '县', '省直辖县级行政区划', '自治区直辖县级行政区划'];

// 省市
const pca = {
var pca = {
'86': provinces['86']
};
// 删除港澳
delete pca['86']['910000'];

// 省市区
const pcaa = {
var pcaa = {
'86': provinces['86']
};

// 提取行政区域 code
const reg = /0(?=0{2,})/;
const target = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/#{route}.html';
const target = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/#{route}.html';

const spinner = ora({
color: 'yellow'
});


/**
 * Shorten an administrative region code by dropping its zero padding.
 *
 * Uses the module-level `reg` pattern to find the first `0` that is followed
 * by at least two more zeros. Everything from that position on is removed,
 * but the result is never shorter than the first six characters.
 *
 * @param {string} code - Region code to shorten.
 * @param {string} [text=''] - Name of the owning city; selects the special case below.
 * @returns {string} The shortened code. A code with no run of three zeros has
 *   no padding to strip and is returned whole (or its first six characters
 *   when it is that short).
 */
function formatCode (code, text = '') {
  // Special handling of district codes for Dongguan (东莞市) and Zhongshan (中山市):
  // only the last three characters are dropped.
  if (text === '东莞市' || text === '中山市') {
    return code.slice(0, -3);
  }
  // exec() yields null when the pattern is absent; treat that as "padding
  // starts at the end" instead of dereferencing null.
  const match = reg.exec(code);
  const index = match ? match.index : code.length;
  return index > 6 ? code.slice(0, index) : code.slice(0, 6);
}

// 省市联动
function formatPCAddress () {
pcodes.forEach(pcode => {
if (pcode === '710000') {
// 台湾
pca[pcode] = provinces['710100'];
} else if (pcode === '910000') {
// 港澳
pca['86']['810000'] = '香港特别行政区';
pca['86']['820000'] = '澳门特别行政区';
pca['810000'] = provinces['810000'];
pca['820000'] = provinces['820000'];
// const t = provinces[pcode];
// Object.keys(t).forEach(item => {
// pca[item] = provinces[item];
// });
} else {
const res = {};
const pcities = cities.filter(city => city.parentCode === pcode);
pcities.forEach(city => {
if (filter.includes(city.text)) {
// 用第三级区域数据补充
const tmps = areas.filter(area => area.parentCode === city.code);
tmps.forEach(tmp => {
res[formatCode(tmp.code)] = tmp.text.indexOf('办事处') > -1 ? tmp.text.slice(0, -3) : tmp.text;
})
} else {
res[formatCode(city.code)] = city.text;
}
});
pca[pcode] = res;
let res = {};
const pcities = cities.filter(city => city.parentCode === pcode);
pcities.forEach(city => {
if (filter.includes(city.text)) {
// 用第三级区域数据补充
const tmps = areas.filter(area => area.parentCode === city.code);
tmps.forEach(tmp => {
res[tmp.code] = tmp.text.indexOf('办事处') > -1 ? tmp.text.slice(0, -3) : tmp.text;
})
} else {
res[city.code] = city.text;
}
});
// 香港,澳门,台湾.
if (pcode.substr(0, 1) === '7') {
if (pcode === '730000') {
// 台湾
res = extra['730100000000'];
}
else {
// 港澳
res = extra[pcode];
}
}
pca[pcode] = res;
});
writeFileSync('pca.js', pca);
}
Expand All @@ -93,8 +78,8 @@ let url = '';
async function getAreasByCCode (page, code, text) {
const pCode = code.substr(0, 2);
const cCodeSuffix = code.substr(2, 2);

url = target.replace('#{route}', `${pCode}/${cCodeSuffix}/${code}`);
const filter_code = code.substr(0, 6);
url = target.replace('#{route}', `${pCode}/${cCodeSuffix}/${filter_code}`);
await page.goto(url);
let res = [];

Expand Down Expand Up @@ -128,72 +113,67 @@ async function formatPCAAddress () {
const f = filter.slice(1);
for (let p = 0, pl = pcodes.length; p < pl; p++) {
const pcode = pcodes[p];
if (pcode === '710000') {
// 台湾
pcaa[pcode] = provinces[pcode];
pcaa['710100'] = provinces['710100'];
} else if (pcode === '910000') {
// 港澳
const t = provinces[pcode];
pcaa[pcode] = t;
Object.keys(t).forEach(item => {
pcaa[item] = provinces[item];
});
} else {
const res = {};
const pcities = cities.filter(city => city.parentCode === pcode);
for(let c = 0, cl = pcities.length; c < cl; c++) {
const pcity = pcities[c];
const pareas = areas.filter(area => area.parentCode === pcity.code);

if (f.includes(pcity.text)) {
// 用第三级区域数据补充到第二级
for(let i = 0, l = pareas.length; i < l; i++) {
const pCurAreas = {};
const parea = pareas[i];
const code = formatCode(parea.code);
res[code] = parea.text.indexOf('办事处') > -1 ? parea.text.slice(0, -3) : parea.text;

// 抓取第三级数据
let [err, data] = await awaitTo(getAreasByCCode(page, code, res[code]));
if (err) {
// 这个重试主要是处理因避免耗时(Navigation Timeout Exceeded)导致的错误
console.log('\n', chalk.red(`抓取数据失败,失败链接: ${url},错误信息: ${err.message},正在重试....\n`));
[err, data] = await awaitTo(getAreasByCCode(page, code, res[code]));
}
spinner.succeed(chalk.green(`市级城市 ${res[code]} 的县区数据抓取完毕.`));
if (data.length) {
console.log('ddddd', data[0]);
data.forEach(item => {
if (item.text !== '市辖区') {
pCurAreas[formatCode(item.code)] = item.text.indexOf('办事处') > -1 ? item.text.slice(0, -3) : item.text;
}
});
pcaa[code] = pCurAreas;
}
await timeout(1500);

const res = {};
const pcities = cities.filter(city => city.parentCode === pcode);
for(let c = 0, cl = pcities.length; c < cl; c++) {
const pcity = pcities[c];
const pareas = areas.filter(area => area.parentCode === pcity.code);

if (f.includes(pcity.text)) {
// 用第三级区域数据补充到第二级
for(let i = 0, l = pareas.length; i < l; i++) {
const pCurAreas = {};
const parea = pareas[i];
const code = parea.code;
res[code] = parea.text.indexOf('办事处') > -1 ? parea.text.slice(0, -3) : parea.text;

// 抓取第三级数据
let [err, data] = await awaitTo(getAreasByCCode(page, code, res[code]));
while (err) {
// 这个重试主要是处理因避免耗时(Navigation Timeout Exceeded)导致的错误
console.log('\n', chalk.red(`抓取数据失败,失败链接: ${url},错误信息: ${err.message},正在重试....\n`));
[err, data] = await awaitTo(getAreasByCCode(page, code, res[code]));
}
} else {
const curAreas = {};
const cityCode = formatCode(pcity.code);
spinner.succeed(chalk.green(`市级城市 ${res[code]} 的县区数据抓取完毕.`));
if (data.length) {
data.forEach(item => {
if (item.text !== '市辖区') {
pCurAreas[item.code] = item.text.indexOf('办事处') > -1 ? item.text.slice(0, -3) : item.text;
}
});
pcaa[code] = pCurAreas;
}
await timeout(1500);
}
} else {
const curAreas = {};
const cityCode = pcity.code;
if (pcity.text == '市辖区') {
// 4个直辖市
res[cityCode] = provinces['86'][pcode];
}
else {
res[cityCode] = pcity.text;

// 第三级数据
pareas.forEach(parea => {
if (parea.text !== '市辖区') {
curAreas[formatCode(parea.code, pcity.text)] = parea.text.indexOf('办事处') > -1 ? parea.text.slice(0, -3) : parea.text;
}
});
pcaa[cityCode] = curAreas;
}

// 第三级数据
pareas.forEach(parea => {
if (parea.text !== '市辖区') {
curAreas[parea.code] = parea.text.indexOf('办事处') > -1 ? parea.text.slice(0, -3) : parea.text;
}
});
pcaa[cityCode] = curAreas;
}
pcaa[pcode] = res;
// 添加港澳台数据
pcaa = {...pcaa, ...extra}
}
}

writeFileSync('pcaa.js', pcaa);
await browser.close();
}

formatPCAddress()
formatPCAAddress();
formatPCAddress();
formatPCAAddress();
20 changes: 10 additions & 10 deletions index.js
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,13 @@ const spinner2 = ora({

const provinces = require('./provinces')['86'];
const pcodes = [];
const target = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/#{route}.html';
const target = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/#{route}.html';

let cities = [];

if (fs.existsSync(path.resolve(__dirname, 'cities.js'))) {
cities = require('./cities.js');
}
//if (fs.existsSync(path.resolve(__dirname, 'cities.js'))) {
// cities = require('./cities.js');
//}

let areas = [];
let url = '';
Expand Down Expand Up @@ -103,7 +103,7 @@ process.on('unhandledRejection', (err) => {
if (!cities.length) {
for(let i = 0, l = pcodes.length; i < l; i++) {
const pcode = pcodes[i];
await timeout(1500);
await timeout(1000);
const [err] = await awaitTo(getCitiesByPCode(page, pcode));
if (err) {
// 这个重试主要是处理因避免耗时(Navigation Timeout Exceeded)导致的错误
Expand All @@ -123,17 +123,17 @@ process.on('unhandledRejection', (err) => {

for(let i = 0, l = cities.length; i < l; i++) {
const city = cities[i];
await timeout(3000);
const [err] = await awaitTo(getAreasByCCode(page, city));
if (err) {
await timeout(1000);
let [err] = await awaitTo(getAreasByCCode(page, city));
while (err) {
// 这个重试主要是处理因避免耗时(Navigation Timeout Exceeded)导致的错误
console.log('\n', chalk.red(`抓取数据失败,失败链接: ${url},错误信息: ${err.message},正在重试....\n`));
await getAreasByCCode(page, city);
[err] = await awaitTo(getAreasByCCode(page, city));
}
}

writeFileSync('areas.js', areas);
spinner2.succeed(chalk.green('县区数据抓取完毕'));

await browser.close();
})();
})();
Loading