批量处理社保局网站数据

背景:提升工作效率,一次性处理N年的数据,同时合并表格数据,便于二次处理。

社保数据结构

File: 初始化爬取数据:

<?php
// File: init-shell.php
declare(strict_types=1);

$idCardList = [
    '张三',
];

foreach ($idCardList as $idCard) {
    $date = strtotime("2010-01-01");
    $endDate = strtotime("2020-09-01");
    for (; $date < $endDate;) {
        $startTimeText = date("Ym", $date);
        $nextDate = strtotime("+6 months", $date);
        if ($nextDate > $endDate) {
            $nextDate = $endDate;
        }
        $endTimeText = date("Ym", strtotime("-1 month", $nextDate));

        echo "echo "ID: {$idCard}, S:{$startTimeText}, E:{$endTimeText}"", PHP_EOL;
        $date = $nextDate;

        $name = urlencode(mb_convert_encoding($idCard,"GBK","UTF-8"));

        $tmp = <<<SHELL
curl 'https://bdyw.guangdong.chinatax.gov.cn/wssw/sbglsjController.do?cmd=sbglsj_sbgrmxcx_list' \
  -H 'Connection: keep-alive' \
  -H 'Cache-Control: max-age=0' \
  -H 'Origin: https://bdyw.guangdong.chinatax.gov.cn' \
  -H 'Upgrade-Insecure-Requests: 1' \
  -H 'DNT: 1' \
  -H 'Content-Type: application/x-www-form-urlencoded' \
  -H 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36' \
  -H 'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9' \
  -H 'Sec-Fetch-Site: same-origin' \
  -H 'Sec-Fetch-Mode: navigate' \
  -H 'Sec-Fetch-User: ?1' \
  -H 'Sec-Fetch-Dest: document' \
  -H 'Referer: https://bdyw.guangdong.chinatax.gov.cn/wssw/sbglsjController.do?cmd=sbglsj_sbgrmxcx_list' \
  -H 'Accept-Language: zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7,zh-HK;q=0.6,ja-JP;q=0.5,ja;q=0.4' \
  -H $'Cookie: [YOUR COOKIE]' \
  --data-raw '[YOUR FORM TABLE]&itemvalue={$name}&ssqq={$startTimeText}&ssqz={$endTimeText}&textfield323=&textfield323=&textfield323=' \
  --compressed \
  -o {$idCard}.{$startTimeText}-{$endTimeText}.html
SHELL
;

        echo $tmp, PHP_EOL, PHP_EOL, PHP_EOL;
    }
    echo PHP_EOL, PHP_EOL, PHP_EOL;
}

获取数据,并得到相关文件:

php init-shell.php > shell.sh
bash shell.sh

得到汇总数据表

<?php
// File: build-full-table.php
declare(strict_types=1);

$list = glob("/tmp/sbs/*.html");
sort($list);

$fullTable = [];

foreach ($list as $file) {
    $item = explode(".", basename($file));
    $name = $item[0];
    $startMonth = explode("-", $item[1])[0];
    $endMonth = explode("-", $item[1])[1];

    $content = mb_convert_encoding(file_get_contents($file), "UTF-8", "GB2312");

    if (strpos($content, '查询不到个人社保费申报缴款明细信息') > 0) {
        // echo sprintf("FILE: %s, 查询不到个人社保费申报缴款明细信息", $file), PHP_EOL;
        continue;
    }
    echo sprintf("File: %s, Name: %s, S: %s, E: %s%s", $file, $name, $startMonth, $endMonth, PHP_EOL);

    $table =
        preg_match('/<table width="100%" border="1" cellpadding="2" cellspacing="0" bordercolor="#111111" style="border-collapse: collapse">[\s\S]*?<\/table>/',
            $content, $matches);

    if ($table !== 1 || empty($matches[0])) {
        echo sprintf("File: %s, 无有效的匹配", $file), PHP_EOL;
        exit(1);
    }

    $match = $matches[0];
    $match = str_replace("\r\n", "\n", $match);
    $match = implode("\n", array_map(static function ($a) {
        return trim($a);
    }, explode("\n", $match)));
    $match = preg_replace("/[\n]{2,}/", "\n", $match);
    $match = preg_replace("/[ ]{2,}/", " ", $match);

    // echo $match;

    $html = <<<HTML5
<!DOCTYPE html>
<html lang="zh-CN">
<head>
  <meta charset="UTF-8">
  <title>Title</title>
</head>
<body>
{$match}
</body>
</html>
HTML5
;

    $document = new DOMDocument();
    if (!$document->loadHTML($html)) {
        echo sprintf("File: %s, 无法加载Html", $file), PHP_EOL;
        exit(1);
    }

    $table = $document->getElementsByTagName('table');

    if ($table->length !== 1) {
        echo sprintf("File: %s, 解析Table失败", $file), PHP_EOL;
        exit(1);
    }

    /** @var DOMElement $tableNode */
    $tableNode = $table->item(0);

    $xpath = new DOMXpath($document);
    $result = $xpath->query("//*/tr[1]/td/input");
    if ($result->length !== 3) {
        echo sprintf("File: %s, 解析Table失败, 无法解析姓名等信息", $file), PHP_EOL;
        exit(1);
    }
    $userInfo = [];
    $userInfoIndex = [
        0 => 'name',
        1 => 'idCard',
        2 => 'sbCard',
    ];
    foreach ($result as $index => $node) {
        /** @var DOMElement $node */
        $value = trim($node->getAttribute('value'));
        $userInfo[$userInfoIndex[$index]] = trim($value);
    }
    if ($userInfo['name'] !== $name) {
        echo sprintf("File: %s, 解析Table失败, 无法匹配姓名等信息", $file), PHP_EOL;
        exit(1);
    }

    $result = $xpath->query("//*/tr[2]/td");

    $tableHeadMap = [];
    foreach ($result as $node) {
        /** @var DOMElement $node */
        $value = trim($node->nodeValue);
        $rowspan = $node->getAttribute('rowspan');
        $colspan = $node->getAttribute('colspan');
        $tableHeadMap[] = [
            'name'    => $value,
            'rowSpan' => (int) (empty($rowspan) ? 1 : $rowspan),
            'colSpan' => (int) (empty($colspan) ? 1 : $colspan),
        ];
    }

    $result = $xpath->query("//*/tr[3]/td");

    $tableHeadMap2 = [];
    foreach ($result as $node) {
        /** @var DOMElement $node */
        $value = trim($node->nodeValue);
        $rowspan = $node->getAttribute('rowspan');
        $colspan = $node->getAttribute('colspan');
        $tableHeadMap2[] = [
            'name' => $value,
        ];
    }

    $tableHeaderBase = [
        '姓名'     => $userInfo['name'],
        '查询开始时间' => $startMonth,
        '查询结束时间' => $endMonth,
        '身份证'    => $userInfo['idCard'],
        '社保号码'   => $userInfo['sbCard'],
    ];
    $tableHeader = [];
    $header2Cu = 0;
    foreach ($tableHeadMap as $head) {
        if ($head['rowSpan'] === 2) {
            $tableHeader[] = $head['name'];
        } elseif ($head['rowSpan'] === 1) {
            $pName = $head['name'];
            for ($i = 0; $i < $head['colSpan']; $i++) {
                if (!isset($tableHeadMap2[$header2Cu])) {
                    echo sprintf("File: %s, Header 表头数据不匹配失败: %s", $file,
                        json_encode($head, JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT)), PHP_EOL;
                    exit(1);
                }
                $tableHeader[] = $pName.' - '.$tableHeadMap2[$header2Cu++]['name'];
            }
        } else {
            echo sprintf("File: %s, Header 解析失败: %s", $file,
                json_encode($head, JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT)), PHP_EOL;
            exit(1);
        }
    }

    $countHead = count($tableHeader);
    if (count(array_unique($tableHeader)) !== $countHead) {
        echo sprintf("File: %s, Header 存在重复数据: %s", $file,
            json_encode($tableHeader, JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT)), PHP_EOL;
        exit(1);
    }

    echo sprintf("File: %s, 获取到表头,共计: %d", $file, count($tableHeader)), PHP_EOL;

    // echo implode("\n", $tableHeader);

    $result = $xpath->query("//*/tr[position() > 3]");
    if ($result === false || $result->length < 1) {
        echo sprintf("File: %s, 无法解析Table数据", $file), PHP_EOL;
        exit(1);
    }
    echo sprintf("File: %s, 获取 %d 条数据", $file, $result->length), PHP_EOL;
    foreach ($result as $node) {
        /** @var DOMElement $node */

        $listQuery = $xpath->query("./td/input", $node);
        if (empty($listQuery) || $listQuery->length !== $countHead) {
            echo sprintf("File: %s, 表头数据获取不一致", $file), PHP_EOL;
            exit(1);
        }

        $rowValue = [];
        $index = 0;
        foreach ($listQuery as $rowNode) {
            /** @var DOMElement $rowNode */
            $rowValue[$tableHeader[$index++]] = trim($rowNode->getAttribute('value'));
        }
        $fullTable[] = array_merge(
            $tableHeaderBase, $rowValue
        );
    }
}

file_put_contents("/tmp/sbs/t.json", json_encode($fullTable, JSON_UNESCAPED_UNICODE | JSON_PRETTY_PRINT));

导出并生成最终的csv表格

<?php
// FILE: genreate-csv.php
declare(strict_types=1);

$fullTable = json_decode(file_get_contents("/tmp/sbs/t.json"), true);

$exportHeader1 = [];
$exportHeader1Map = [];
$exportHeader2 = [];
$exportHeader2Map = [];

foreach ($fullTable as $item) {
    $keys = array_keys($item);
    foreach ($keys as $key) {
        if (!in_array($key, $exportHeader1, true)) {
            $exportHeader1[] = $key;
            $exportHeader1Map[$key] = count($exportHeader1) - 1;
        }

        $keySimple = trim(preg_replace("/\\([\d.]+%\\)/","", $key));
        if (!in_array($keySimple, $exportHeader2, true)) {
            $exportHeader2[] = $keySimple;
        }
        if (!isset($exportHeader2Map[$key])) {
            $exportHeader2Map[$key] = array_search($keySimple, $exportHeader2, true);
        }
    }
}

echo "表头1", implode(", ", $exportHeader1), PHP_EOL;
print_r($exportHeader1Map);
echo PHP_EOL;
echo "表头2", implode(", ", $exportHeader2), PHP_EOL;
print_r($exportHeader2Map);
echo PHP_EOL;

$fp1 = fopen("/tmp/sbs/export-full.csv", "wb");
$fp2 = fopen("/tmp/sbs/export-simple.csv", "wb");
if (!$fp1) {
    echo "处理失败: 无法打开文件: fp1";
    exit;
}
if (!$fp2) {
    echo "处理失败: 无法打开文件: fp2";
    exit;
}

fwrite($fp1, hex2bin('EFBBBF'));//注释当前为UTF-8编码
fwrite($fp2, hex2bin('EFBBBF'));//注释当前为UTF-8编码

fputcsv($fp1, $exportHeader1);
fputcsv($fp2, $exportHeader2);

foreach ($fullTable as $item) {
    // table1
    $row1 = array_values(array_fill_keys($exportHeader1, ""));
    // table2
    $row2 = array_values(array_fill_keys($exportHeader2, ""));
    foreach ($item as $k => $v) {
        $row1[$exportHeader1Map[$k]] = $v;
        $row2[$exportHeader2Map[$k]] = $v;
    }
    fputcsv($fp1, $row1);
    fputcsv($fp2, $row2);
}

fclose($fp1);
fclose($fp2);
exit(0);

当前还没有任何评论

写下你最简单的想法