背景:提升工作效率,一次性处理N年的数据,同时合并表格数据,便于二次处理。
File: 初始化爬取数据:
<?php
// File: init-shell.php
declare(strict_types=1);
$idCardList = [
'张三',
];
foreach ($idCardList as $idCard) {
$date = strtotime("2010-01-01");
$endDate = strtotime("2020-09-01");
for (; $date < $endDate;) {
$startTimeText = date("Ym", $date);
$nextDate = strtotime("+6 months", $date);
if ($nextDate > $endDate) {
$nextDate = $endDate;
}
$endTimeText = date("Ym", strtotime("-1 month", $nextDate));
echo "echo "ID: {$idCard}, S:{$startTimeText}, E:{$endTimeText}"", PHP_EOL;
$date = $nextDate;
$name = urlencode(mb_convert_encoding($idCard,"GBK","UTF-8"));
$tmp = <<<SHELL
curl 'https://bdyw.guangdong.chinatax.gov.cn/wssw/sbglsjController.do?cmd=sbglsj_sbgrmxcx_list' \
-H 'Connection: keep-alive' \
-H 'Cache-Control: max-age=0' \
-H 'Origin: https://bdyw.guangdong.chinatax.gov.cn' \
-H 'Upgrade-Insecure-Requests: 1' \
-H 'DNT: 1' \
-H 'Content-Type: application/x-www-form-urlencoded' \
-H 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36' \
-H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9' \
-H 'Sec-Fetch-Site: same-origin' \
-H 'Sec-Fetch-Mode: navigate' \
-H 'Sec-Fetch-User: ?1' \
-H 'Sec-Fetch-Dest: document' \
-H 'Referer: https://bdyw.guangdong.chinatax.gov.cn/wssw/sbglsjController.do?cmd=sbglsj_sbgrmxcx_list' \
-H 'Accept-Language: zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-HK;q=0.6,ja-JP;q=0.5,ja;q=0.4' \
-H $'Cookie: [YOUR COOKIE]' \
--data-raw '[YOUR FORM TABLE]&itemvalue={$name}&ssqq={$startTimeText}&ssqz={$endTimeText}&textfield323=&textfield323=&textfield323=' \
--compressed \
-o {$idCard}.{$startTimeText}-{$endTimeText}.html
SHELL;
echo $tmp, PHP_EOL, PHP_EOL, PHP_EOL;
}
echo PHP_EOL, PHP_EOL, PHP_EOL;
}
// File: init-shell.php
declare(strict_types=1);
$idCardList = [
'张三',
];
foreach ($idCardList as $idCard) {
$date = strtotime("2010-01-01");
$endDate = strtotime("2020-09-01");
for (; $date < $endDate;) {
$startTimeText = date("Ym", $date);
$nextDate = strtotime("+6 months", $date);
if ($nextDate > $endDate) {
$nextDate = $endDate;
}
$endTimeText = date("Ym", strtotime("-1 month", $nextDate));
echo "echo "ID: {$idCard}, S:{$startTimeText}, E:{$endTimeText}"", PHP_EOL;
$date = $nextDate;
$name = urlencode(mb_convert_encoding($idCard,"GBK","UTF-8"));
$tmp = <<<SHELL
curl 'https://bdyw.guangdong.chinatax.gov.cn/wssw/sbglsjController.do?cmd=sbglsj_sbgrmxcx_list' \
-H 'Connection: keep-alive' \
-H 'Cache-Control: max-age=0' \
-H 'Origin: https://bdyw.guangdong.chinatax.gov.cn' \
-H 'Upgrade-Insecure-Requests: 1' \
-H 'DNT: 1' \
-H 'Content-Type: application/x-www-form-urlencoded' \
-H 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36' \
-H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9' \
-H 'Sec-Fetch-Site: same-origin' \
-H 'Sec-Fetch-Mode: navigate' \
-H 'Sec-Fetch-User: ?1' \
-H 'Sec-Fetch-Dest: document' \
-H 'Referer: https://bdyw.guangdong.chinatax.gov.cn/wssw/sbglsjController.do?cmd=sbglsj_sbgrmxcx_list' \
-H 'Accept-Language: zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-HK;q=0.6,ja-JP;q=0.5,ja;q=0.4' \
-H $'Cookie: [YOUR COOKIE]' \
--data-raw '[YOUR FORM TABLE]&itemvalue={$name}&ssqq={$startTimeText}&ssqz={$endTimeText}&textfield323=&textfield323=&textfield323=' \
--compressed \
-o {$idCard}.{$startTimeText}-{$endTimeText}.html
SHELL;
echo $tmp, PHP_EOL, PHP_EOL, PHP_EOL;
}
echo PHP_EOL, PHP_EOL, PHP_EOL;
}
获取数据,并得到相关文件:
php init-shell.php > shell.sh
bash shell.sh
bash shell.sh
得到汇总数据表
<?php
// File: build-full-table.php
declare(strict_types=1);
$list = glob("/tmp/sbs/*.html");
sort($list);
$fullTable = [];
foreach ($list as $file) {
$item = explode(".", basename($file));
$name = $item[0];
$startMonth = explode("-", $item[1])[0];
$endMonth = explode("-", $item[1])[1];
$content = mb_convert_encoding(file_get_contents($file), "UTF-8", "GB2312");
if (strpos($content, '查询不到个人社保费申报缴款明细信息') > 0) {
// echo sprintf("FILE: %s, 查询不到个人社保费申报缴款明细信息", $file), PHP_EOL;
continue;
}
echo sprintf("File: %s, Name: %s, S: %s, E: %s%s", $file, $name, $startMonth, $endMonth, PHP_EOL);
$table =
preg_match('/<table width="100%" border="1" cellpadding="2" cellspacing="0" bordercolor="#111111" style="border-collapse: collapse">[\s\S]*?<\/table>/',
$content, $matches);
if ($table !== 1 || empty($matches[0])) {
echo sprintf("File: %s, 无有效的匹配", $file), PHP_EOL;
exit(1);
}
$match = $matches[0];
$match = str_replace("\r\n", "\n", $match);
$match = implode("\n", array_map(static function ($a) {
return trim($a);
}, explode("\n", $match)));
$match = preg_replace("/[\n]{2,}/", "\n", $match);
$match = preg_replace("/[ ]{2,}/", " ", $match);
// echo $match;
$html = <<<HTML5
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<title>Title</title>
</head>
<body>
{$match}
</body>
</html>
HTML5;
$document = new DOMDocument();
if (!$document->loadHTML($html)) {
echo sprintf("File: %s, 无法加载Html", $file), PHP_EOL;
exit(1);
}
$table = $document->getElementsByTagName('table');
if ($table->length !== 1) {
echo sprintf("File: %s, 解析Table失败", $file), PHP_EOL;
exit(1);
}
/** @var DOMElement $tableNode */
$tableNode = $table->item(0);
$xpath = new DOMXpath($document);
$result = $xpath->query("//*/tr[1]/td/input");
if ($result->length !== 3) {
echo sprintf("File: %s, 解析Table失败, 无法解析姓名等信息", $file), PHP_EOL;
exit(1);
}
$userInfo = [];
$userInfoIndex = [
0 => 'name',
1 => 'idCard',
2 => 'sbCard',
];
foreach ($result as $index => $node) {
/** @var DOMElement $node */
$value = trim($node->getAttribute('value'));
$userInfo[$userInfoIndex[$index]] = trim($value);
}
if ($userInfo['name'] !== $name) {
echo sprintf("File: %s, 解析Table失败, 无法匹配姓名等信息", $file), PHP_EOL;
exit(1);
}
$result = $xpath->query("//*/tr[2]/td");
$tableHeadMap = [];
foreach ($result as $node) {
/** @var DOMElement $node */
$value = trim($node->nodeValue);
$rowspan = $node->getAttribute('rowspan');
$colspan = $node->getAttribute('colspan');
$tableHeadMap[] = [
'name' => $value,
'rowSpan' => (int) (empty($rowspan) ? 1 : $rowspan),
'colSpan' => (int) (empty($colspan) ? 1 : $colspan),
];
}
$result = $xpath->query("//*/tr[3]/td");
$tableHeadMap2 = [];
foreach ($result as $node) {
/** @var DOMElement $node */
$value = trim($node->nodeValue);
$rowspan = $node->getAttribute('rowspan');
$colspan = $node->getAttribute('colspan');
$tableHeadMap2[] = [
'name' => $value,
];
}
$tableHeaderBase = [
'姓名' => $userInfo['name'],
'查询开始时间' => $startMonth,
'查询结束时间' => $endMonth,
'身份证' => $userInfo['idCard'],
'社保号码' => $userInfo['sbCard'],
];
$tableHeader = [];
$header2Cu = 0;
foreach ($tableHeadMap as $head) {
if ($head['rowSpan'] === 2) {
$tableHeader[] = $head['name'];
} elseif ($head['rowSpan'] === 1) {
$pName = $head['name'];
for ($i = 0; $i < $head['colSpan']; $i++) {
if (!isset($tableHeadMap2[$header2Cu])) {
echo sprintf("File: %s, Header 表头数据不匹配失败: %s", $file,
json_encode($head, JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT)), PHP_EOL;
exit(1);
}
$tableHeader[] = $pName.' - '.$tableHeadMap2[$header2Cu++]['name'];
}
} else {
echo sprintf("File: %s, Header 解析失败: %s", $file,
json_encode($head, JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT)), PHP_EOL;
exit(1);
}
}
$countHead = count($tableHeader);
if (count(array_unique($tableHeader)) !== $countHead) {
echo sprintf("File: %s, Header 存在重复数据: %s", $file,
json_encode($tableHeader, JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT)), PHP_EOL;
exit(1);
}
echo sprintf("File: %s, 获取到表头,共计: %d", $file, count($tableHeader)), PHP_EOL;
// echo implode("\n", $tableHeader);
$result = $xpath->query("//*/tr[position() > 3]");
if ($result === false || $result->length < 1) {
echo sprintf("File: %s, 无法解析Table数据", $file), PHP_EOL;
exit(1);
}
echo sprintf("File: %s, 获取 %d 条数据", $file, $result->length), PHP_EOL;
foreach ($result as $node) {
/** @var DOMElement $node */
$listQuery = $xpath->query("./td/input", $node);
if (empty($listQuery) || $listQuery->length !== $countHead) {
echo sprintf("File: %s, 表头数据获取不一致", $file), PHP_EOL;
exit(1);
}
$rowValue = [];
$index = 0;
foreach ($listQuery as $rowNode) {
/** @var DOMElement $rowNode */
$rowValue[$tableHeader[$index++]] = trim($rowNode->getAttribute('value'));
}
$fullTable[] = array_merge(
$tableHeaderBase, $rowValue
);
}
}
file_put_contents("/tmp/sbs/t.json", json_encode($fullTable, JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT));
// File: build-full-table.php
declare(strict_types=1);
$list = glob("/tmp/sbs/*.html");
sort($list);
$fullTable = [];
foreach ($list as $file) {
$item = explode(".", basename($file));
$name = $item[0];
$startMonth = explode("-", $item[1])[0];
$endMonth = explode("-", $item[1])[1];
$content = mb_convert_encoding(file_get_contents($file), "UTF-8", "GB2312");
if (strpos($content, '查询不到个人社保费申报缴款明细信息') > 0) {
// echo sprintf("FILE: %s, 查询不到个人社保费申报缴款明细信息", $file), PHP_EOL;
continue;
}
echo sprintf("File: %s, Name: %s, S: %s, E: %s%s", $file, $name, $startMonth, $endMonth, PHP_EOL);
$table =
preg_match('/<table width="100%" border="1" cellpadding="2" cellspacing="0" bordercolor="#111111" style="border-collapse: collapse">[\s\S]*?<\/table>/',
$content, $matches);
if ($table !== 1 || empty($matches[0])) {
echo sprintf("File: %s, 无有效的匹配", $file), PHP_EOL;
exit(1);
}
$match = $matches[0];
$match = str_replace("\r\n", "\n", $match);
$match = implode("\n", array_map(static function ($a) {
return trim($a);
}, explode("\n", $match)));
$match = preg_replace("/[\n]{2,}/", "\n", $match);
$match = preg_replace("/[ ]{2,}/", " ", $match);
// echo $match;
$html = <<<HTML5
<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<title>Title</title>
</head>
<body>
{$match}
</body>
</html>
HTML5;
$document = new DOMDocument();
if (!$document->loadHTML($html)) {
echo sprintf("File: %s, 无法加载Html", $file), PHP_EOL;
exit(1);
}
$table = $document->getElementsByTagName('table');
if ($table->length !== 1) {
echo sprintf("File: %s, 解析Table失败", $file), PHP_EOL;
exit(1);
}
/** @var DOMElement $tableNode */
$tableNode = $table->item(0);
$xpath = new DOMXpath($document);
$result = $xpath->query("//*/tr[1]/td/input");
if ($result->length !== 3) {
echo sprintf("File: %s, 解析Table失败, 无法解析姓名等信息", $file), PHP_EOL;
exit(1);
}
$userInfo = [];
$userInfoIndex = [
0 => 'name',
1 => 'idCard',
2 => 'sbCard',
];
foreach ($result as $index => $node) {
/** @var DOMElement $node */
$value = trim($node->getAttribute('value'));
$userInfo[$userInfoIndex[$index]] = trim($value);
}
if ($userInfo['name'] !== $name) {
echo sprintf("File: %s, 解析Table失败, 无法匹配姓名等信息", $file), PHP_EOL;
exit(1);
}
$result = $xpath->query("//*/tr[2]/td");
$tableHeadMap = [];
foreach ($result as $node) {
/** @var DOMElement $node */
$value = trim($node->nodeValue);
$rowspan = $node->getAttribute('rowspan');
$colspan = $node->getAttribute('colspan');
$tableHeadMap[] = [
'name' => $value,
'rowSpan' => (int) (empty($rowspan) ? 1 : $rowspan),
'colSpan' => (int) (empty($colspan) ? 1 : $colspan),
];
}
$result = $xpath->query("//*/tr[3]/td");
$tableHeadMap2 = [];
foreach ($result as $node) {
/** @var DOMElement $node */
$value = trim($node->nodeValue);
$rowspan = $node->getAttribute('rowspan');
$colspan = $node->getAttribute('colspan');
$tableHeadMap2[] = [
'name' => $value,
];
}
$tableHeaderBase = [
'姓名' => $userInfo['name'],
'查询开始时间' => $startMonth,
'查询结束时间' => $endMonth,
'身份证' => $userInfo['idCard'],
'社保号码' => $userInfo['sbCard'],
];
$tableHeader = [];
$header2Cu = 0;
foreach ($tableHeadMap as $head) {
if ($head['rowSpan'] === 2) {
$tableHeader[] = $head['name'];
} elseif ($head['rowSpan'] === 1) {
$pName = $head['name'];
for ($i = 0; $i < $head['colSpan']; $i++) {
if (!isset($tableHeadMap2[$header2Cu])) {
echo sprintf("File: %s, Header 表头数据不匹配失败: %s", $file,
json_encode($head, JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT)), PHP_EOL;
exit(1);
}
$tableHeader[] = $pName.' - '.$tableHeadMap2[$header2Cu++]['name'];
}
} else {
echo sprintf("File: %s, Header 解析失败: %s", $file,
json_encode($head, JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT)), PHP_EOL;
exit(1);
}
}
$countHead = count($tableHeader);
if (count(array_unique($tableHeader)) !== $countHead) {
echo sprintf("File: %s, Header 存在重复数据: %s", $file,
json_encode($tableHeader, JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT)), PHP_EOL;
exit(1);
}
echo sprintf("File: %s, 获取到表头,共计: %d", $file, count($tableHeader)), PHP_EOL;
// echo implode("\n", $tableHeader);
$result = $xpath->query("//*/tr[position() > 3]");
if ($result === false || $result->length < 1) {
echo sprintf("File: %s, 无法解析Table数据", $file), PHP_EOL;
exit(1);
}
echo sprintf("File: %s, 获取 %d 条数据", $file, $result->length), PHP_EOL;
foreach ($result as $node) {
/** @var DOMElement $node */
$listQuery = $xpath->query("./td/input", $node);
if (empty($listQuery) || $listQuery->length !== $countHead) {
echo sprintf("File: %s, 表头数据获取不一致", $file), PHP_EOL;
exit(1);
}
$rowValue = [];
$index = 0;
foreach ($listQuery as $rowNode) {
/** @var DOMElement $rowNode */
$rowValue[$tableHeader[$index++]] = trim($rowNode->getAttribute('value'));
}
$fullTable[] = array_merge(
$tableHeaderBase, $rowValue
);
}
}
file_put_contents("/tmp/sbs/t.json", json_encode($fullTable, JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT));
导出并生成最终的csv表格
<?php
// FILE: genreate-csv.php
declare(strict_types=1);
$fullTable = json_decode(file_get_contents("/tmp/sbs/t.json"), true);
$exportHeader1 = [];
$exportHeader1Map = [];
$exportHeader2 = [];
$exportHeader2Map = [];
foreach ($fullTable as $item) {
$keys = array_keys($item);
foreach ($keys as $key) {
if (!in_array($key, $exportHeader1, true)) {
$exportHeader1[] = $key;
$exportHeader1Map[$key] = count($exportHeader1) - 1;
}
$keySimple = trim(preg_replace("/\\([\d.]+%\\)/","", $key));
if (!in_array($keySimple, $exportHeader2, true)) {
$exportHeader2[] = $keySimple;
}
if (!isset($exportHeader2Map[$key])) {
$exportHeader2Map[$key] = array_search($keySimple, $exportHeader2, true);
}
}
}
echo "表头1", implode(", ", $exportHeader1), PHP_EOL;
print_r($exportHeader1Map);
echo PHP_EOL;
echo "表头2", implode(", ", $exportHeader2), PHP_EOL;
print_r($exportHeader2Map);
echo PHP_EOL;
$fp1 = fopen("/tmp/sbs/export-full.csv", "wb");
$fp2 = fopen("/tmp/sbs/export-simple.csv", "wb");
if (!$fp1) {
echo "处理失败: 无法打开文件: fp1";
exit;
}
if (!$fp2) {
echo "处理失败: 无法打开文件: fp2";
exit;
}
fwrite($fp1, hex2bin('EFBBBF'));//注释当前为UTF-8编码
fwrite($fp2, hex2bin('EFBBBF'));//注释当前为UTF-8编码
fputcsv($fp1, $exportHeader1);
fputcsv($fp2, $exportHeader2);
foreach ($fullTable as $item) {
// table1
$row1 = array_values(array_fill_keys($exportHeader1, ""));
// table2
$row2 = array_values(array_fill_keys($exportHeader2, ""));
foreach ($item as $k => $v) {
$row1[$exportHeader1Map[$k]] = $v;
$row2[$exportHeader2Map[$k]] = $v;
}
fputcsv($fp1, $row1);
fputcsv($fp2, $row2);
}
fclose($fp1);
fclose($fp2);
exit(0);
// FILE: genreate-csv.php
declare(strict_types=1);
$fullTable = json_decode(file_get_contents("/tmp/sbs/t.json"), true);
$exportHeader1 = [];
$exportHeader1Map = [];
$exportHeader2 = [];
$exportHeader2Map = [];
foreach ($fullTable as $item) {
$keys = array_keys($item);
foreach ($keys as $key) {
if (!in_array($key, $exportHeader1, true)) {
$exportHeader1[] = $key;
$exportHeader1Map[$key] = count($exportHeader1) - 1;
}
$keySimple = trim(preg_replace("/\\([\d.]+%\\)/","", $key));
if (!in_array($keySimple, $exportHeader2, true)) {
$exportHeader2[] = $keySimple;
}
if (!isset($exportHeader2Map[$key])) {
$exportHeader2Map[$key] = array_search($keySimple, $exportHeader2, true);
}
}
}
echo "表头1", implode(", ", $exportHeader1), PHP_EOL;
print_r($exportHeader1Map);
echo PHP_EOL;
echo "表头2", implode(", ", $exportHeader2), PHP_EOL;
print_r($exportHeader2Map);
echo PHP_EOL;
$fp1 = fopen("/tmp/sbs/export-full.csv", "wb");
$fp2 = fopen("/tmp/sbs/export-simple.csv", "wb");
if (!$fp1) {
echo "处理失败: 无法打开文件: fp1";
exit;
}
if (!$fp2) {
echo "处理失败: 无法打开文件: fp2";
exit;
}
fwrite($fp1, hex2bin('EFBBBF'));//注释当前为UTF-8编码
fwrite($fp2, hex2bin('EFBBBF'));//注释当前为UTF-8编码
fputcsv($fp1, $exportHeader1);
fputcsv($fp2, $exportHeader2);
foreach ($fullTable as $item) {
// table1
$row1 = array_values(array_fill_keys($exportHeader1, ""));
// table2
$row2 = array_values(array_fill_keys($exportHeader2, ""));
foreach ($item as $k => $v) {
$row1[$exportHeader1Map[$k]] = $v;
$row2[$exportHeader2Map[$k]] = $v;
}
fputcsv($fp1, $row1);
fputcsv($fp2, $row2);
}
fclose($fp1);
fclose($fp2);
exit(0);
当前还没有任何评论