php 【脚本】万级别导出比对数据(省市区差异)

Posted

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了php 【脚本】万级别导出比对数据(省市区差异)相关的知识,希望对你有一定的参考价值。

<?php
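/**
 * Repair job postings whose address disagrees with their map marker
 * (the focus is on selecting the records that need repairing).
 * 1. Find the job postings
 * 2. Look up the detailed Baidu Map address from the stored coordinates
 * 3. Compare province / city / district and pick out the postings that differ significantly
 * 4. Save them to a file in the required format
 *
 * usage: php repair_job_address.php step=1 ak=dASz7ubuSpHidP1oQWKuAK3q
 *
 * @author huangmin <andhm@126.com>
 * @version 1.0
 * @date 2018-08-21
 * @copyright 2018 goodjobs.cn
 */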
require dirname(__FILE__) . '/../webapp/config.php';
require_once AG_APP_DIR . '/agavi.php';
require_once AG_DOC_ROOT . '/lib/BaseCron.php';
 
class RepairJobAddress extends BaseCron
{
/**
 * Step 1: save the job records (including the Baidu Map address
 * obtained from each job's latitude/longitude) to a file.
 */
    const STEP_STORE_JOB = 1;
 
/**
 * Step 2: from the saved job records, pick out those whose job address
 * and Baidu-marker address disagree, and save them to a second file.
 */
    const STEP_MATCH_ADDRESS = 2;
 
/**
 * Step 3: write the mismatching records out in CSV format.
 */
    const STEP_STORE_TO_EXCEL = 3;
 
/**
 * Step to execute (one of the STEP_* constants; parseArgv() may store
 * the raw command-line string here instead of an integer).
 * @var int
 */
    protected $step;
 
/**
 * Baidu Map API key ("ak") sent with every geocoder request.
 * @var string
 */
    protected $ak;
 
/**
 * Error code. NOTE(review): never assigned anywhere in this class.
 * @var int
 */
    protected $errno;
 
/**
 * Message describing the last failure; run() echoes it when a step returns false.
 * @var string
 */
    protected $error;
 
/**
 * Directory that holds the working files below.
 * @var string
 */
    protected $logDir;
 
/**
 * Path of the job record file (one JSON document per line).
 * @var string
 */
    protected $logJobsFile;
 
/**
 * Path of the file listing jobs whose address does not match the
 * Baidu Map marker address (one JSON document per line).
 * @var string
 */
    protected $logDiffFile;
 
/**
 * Path of the CSV export (meant to be opened in Excel) of the mismatching records.
 * @var string
 */
    protected $logDiffExcelFile;
 
    /**
     * Set the defaults (step 1 and the built-in Baidu key), derive the three
     * working-file paths from the log directory, then delegate to the parent
     * constructor.
     *
     * @param string $cronFile passed straight to the parent constructor
     *                         (the bootstrap at the bottom of this file passes __FILE__)
     */
    public function __construct($cronFile)
    {
        // The directory must already exist: nothing in this constructor creates it.
        $workDir = AG_DOC_ROOT . '/var/logs/http_to_https/';

        $this->logDir           = $workDir;
        $this->logJobsFile      = $workDir . 'jobaddress.log';
        $this->logDiffFile      = $workDir . 'jobaddress_diff.log';
        $this->logDiffExcelFile = $workDir . 'jobaddress_diff.csv';

        // Defaults; both can be overridden afterwards through parseArgv().
        // NOTE(review): an API key is hard-coded here - consider moving it to configuration.
        $this->ak   = 'Z2MIXNix38i4SwOC3oAHz3dP4OYlQ6kN';
        $this->step = self::STEP_STORE_JOB;

        parent::__construct($cronFile);
    }
 
/**
 * Core logic of the script: run the handler that belongs to the selected step.
 *
 * Output contract:
 *  - handler returned true : "STEP n done."
 *  - handler returned false: the handler's error message, then "STEP n failed."
 *  - unknown step          : "Unknown step n" only
 *
 * @return void
 */
    protected function run()
    {
        // switch compares loosely, so the string value that parseArgv() may
        // store in $this->step still matches the integer STEP_* constants.
        switch ($this->step) {
            case self::STEP_STORE_JOB:
                $succeeded = $this->storeJobInfoAndBaiduAddress();
                break;

            case self::STEP_MATCH_ADDRESS:
                $succeeded = $this->checkMatchingFromFile();
                break;

            case self::STEP_STORE_TO_EXCEL:
                $succeeded = $this->storeJobInfoToExcel();
                break;

            default:
                // Nothing ran, so do not claim the step is done.
                echo 'Unknown step ' . $this->step . PHP_EOL;
                return;
        }

        if (!$succeeded) {
            echo $this->error . PHP_EOL;
            echo 'STEP ' . $this->step . ' failed.' . PHP_EOL;
            return;
        }

        echo 'STEP ' . $this->step . ' done.' . PHP_EOL;
    }
 
/**
 * Overrides the parent's hook with an empty body so that nothing extra
 * happens here.
 * NOTE(review): what the parent implementation does, and when this hook is
 * called, is not visible in this file - presumably before run(); confirm in BaseCron.
 * @author huangmin <andhm@126.com>
 * @version 1.0
 * @date 2018-08-14
 * @return void
 */
    protected function beforeRun()
    {
    }
 
/**
 * Overrides the parent's hook with an empty body so that nothing extra
 * happens here.
 * NOTE(review): what the parent implementation does, and when this hook is
 * called, is not visible in this file - presumably after run(); confirm in BaseCron.
 * @author huangmin <andhm@126.com>
 * @version 1.0
 * @date 2018-08-14
 * @return void
 */
    protected function afterRun()
    {
    }
 
/**
 * Step 1: append every active job, together with the Baidu Map address of
 * its marker, to the job log (one JSON document per line).
 *
 * Jobs are read in batches of 1000 ordered by job_id, starting after the
 * last job_id already present in the job log, so an interrupted run can be
 * resumed by simply starting the step again. Jobs without coordinates are
 * stored with 'baidu_address' => null.
 *
 * The run stops with false (and $this->error set) when the job log is not
 * writable, when a query throws, or when a Baidu lookup fails.
 *
 * @author huangmin <andhm@126.com>
 * @version 1.0
 * @date 2018-08-22
 * @return bool true when all batches were processed
 */
    protected function storeJobInfoAndBaiduAddress()
    {
        // Refuse to start when the results cannot be stored: otherwise every
        // record costs a Baidu API call and is then lost by a failing write.
        $writeTarget = file_exists($this->logJobsFile) ? $this->logJobsFile : $this->logDir;
        if (!is_writable($writeTarget)) {
            $this->error = 'job log not writable: ' . $this->logJobsFile;
            return false;
        }

        $batchNum  = 1000;
        $lastJobId = $this->loadLastJobIdFromFile();
        $today     = date('Y-m-d');

        do {
            $sql = sprintf("select JOB.job_id, JOB.job_alias_name, JOB.province, JOB.city, JOB.district, JOB.address, JOB.linkman, JOB.phone, JOB.mem_corp_id, JOB.account_id, JOB.corp_id, JOB.map_lat, JOB.map_lng, CORP.corp_name from t_jobs as JOB left join t_corp_info as CORP ON JOB.corp_id=CORP.corp_id where JOB.job_status='NORMAL' and JOB.date_end>%s and JOB.job_id>%u order by JOB.job_id asc limit %u", "'" . $today . "'", $lastJobId, $batchNum);
            try {
                $stmt = $this->dbConnection->prepareStatement($sql);
                $rs   = $stmt->executeQuery(ResultSet::FETCHMODE_ASSOC);
            } catch (Exception $ex) {
                $this->error = 'sql exception: ' . $ex->getMessage();
                return false;
            }

            echo 'SQL: ' . $this->dbConnection->lastQuery . PHP_EOL;

            while ($rs->next()) {
                $jobId  = $rs->getInt('job_id');
                $mapLat = $rs->getString('map_lat');
                $mapLng = $rs->getString('map_lng');

                // Only jobs that carry a marker are looked up at Baidu.
                $addressFromBd = null;
                if (!empty($mapLat) && !empty($mapLng)) {
                    $addressFromBd = $this->getAddressFromBaiduMap($mapLat . ',' . $mapLng);
                    if ($addressFromBd === false) {
                        // $this->error was set by the lookup.
                        return false;
                    }
                }

                $this->addLogToFile($this->logJobsFile, [
                    'job_id'         => $jobId,
                    'job_alias_name' => $rs->getString('job_alias_name'),
                    'corp_id'        => $rs->getInt('corp_id'),
                    'corp_name'      => $rs->getString('corp_name'),
                    'mem_corp_id'    => $rs->getInt('mem_corp_id'),
                    'account_id'     => $rs->getInt('account_id'),
                    'linkman'        => $rs->getString('linkman'),
                    'phone'          => $rs->getString('phone'),
                    'address'        => $rs->getString('address'),
                    'province'       => $rs->getInt('province'),
                    'city'           => $rs->getInt('city'),
                    'district'       => $rs->getInt('district'),
                    'map_lat'        => $mapLat,
                    'map_lng'        => $mapLng,
                    'baidu_address'  => $addressFromBd,
                ]);

                // The next batch continues after the last job written.
                $lastJobId = $jobId;
            }

            // A full batch means there may be more rows; a short one was the last.
        } while ($rs->getRecordCount() == $batchNum);

        return true;
    }
 
/**
 * Reverse-geocode a coordinate through the Baidu Map geocoder v2 API.
 *
 * On any failure (transport error, timeout, undecodable response, missing
 * or non-zero API status) $this->error is set and false is returned.
 * Address parts that are missing from the response are returned as null.
 *
 * @author huangmin <andhm@126.com>
 * @version 1.0
 * @date 2018-08-21
 * @param string $location coordinate in "lat,lng" form
 * @return bool|array false on failure, otherwise an array with the keys
 *                    formatted_address, province, city, district, street, street_number
 */
    protected function getAddressFromBaiduMap($location)
    {
        // Build the query with proper URL encoding instead of raw interpolation.
        $query = http_build_query([
            'location' => $location,
            'output'   => 'json',
            'ak'       => $this->ak,
        ], '', '&');
        $url = 'http://api.map.baidu.com/geocoder/v2/?' . $query;

        $curl = curl_init();
        curl_setopt($curl, CURLOPT_URL, $url);
        curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
        // Without timeouts one stalled connection would hang the whole batch run.
        curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 5);
        curl_setopt($curl, CURLOPT_TIMEOUT, 15);

        $response = curl_exec($curl);
        if ($response === false || curl_errno($curl)) {
            $this->error = 'curl error: ' . curl_error($curl) . '; location: ' . $location;
            curl_close($curl);
            return false;
        }

        curl_close($curl);

        $decoded = json_decode($response, true);
        if (!is_array($decoded) || empty($decoded)) {
            // Report the raw response: the decoded value is useless for diagnosis here.
            $this->error = 'json_decode error: ' . var_export($response, true) . '; location: ' . $location;
            return false;
        }
        if (!isset($decoded['status']) || $decoded['status'] != 0) {
            $this->error = 'baidu map api return error: ' . var_export($decoded, true) . '; location: ' . $location;
            return false;
        }

        $result    = (isset($decoded['result']) && is_array($decoded['result'])) ? $decoded['result'] : [];
        $component = (isset($result['addressComponent']) && is_array($result['addressComponent']))
            ? $result['addressComponent']
            : [];

        // An empty formatted_address is deliberately not treated as an error.
        $address = [
            'formatted_address' => isset($result['formatted_address']) ? $result['formatted_address'] : null,
        ];
        foreach (['province', 'city', 'district', 'street', 'street_number'] as $part) {
            $address[$part] = isset($component[$part]) ? $component[$part] : null;
        }

        return $address;
    }
 
/**
 * Step 2: read the job log line by line, compare each job's own
 * province / city / district with the ones Baidu returned for its marker,
 * and append every mismatching job to the diff log.
 *
 * Region ids are translated to names through t_types; names already looked
 * up are cached for the rest of the run. Jobs without a Baidu address (no
 * coordinates) are ignored. Blank or undecodable lines are skipped and
 * reported instead of ending the run early.
 *
 * @author huangmin <andhm@126.com>
 * @version 1.0
 * @date 2018-08-22
 * @return bool false (with $this->error set) when the job log cannot be
 *              opened, a job has no region at all, or a query throws
 */
    protected function checkMatchingFromFile()
    {
        $fp = @fopen($this->logJobsFile, 'r');
        if (empty($fp)) {
            $this->error = 'job file not exists';
            return false;
        }

        // Names already fetched from the database, keyed by type_id.
        $typeCache = [];
        $succeeded = true;
        $lineNo    = 0;

        // fgets() returns false at end of file, which ends the loop cleanly.
        while (($line = fgets($fp)) !== false) {
            $lineNo++;
            $line = trim($line);
            if ($line === '') {
                continue;
            }
            $jobInfo = json_decode($line, true);
            if (!is_array($jobInfo) || empty($jobInfo)) {
                // A damaged line must not hide all the records after it.
                echo 'skip malformed line ' . $lineNo . PHP_EOL;
                continue;
            }
            if (!isset($jobInfo['baidu_address'])) {
                // No coordinates were stored for this job: nothing to compare.
                continue;
            }

            // Compatibility for Chaohu: the old city id 1046 is mapped to
            // city 1043 / district 5563 before the names are resolved.
            if ($jobInfo['city'] == 1046) {
                $jobInfo['city']     = 1043;
                $jobInfo['district'] = 5563;
            }

            // Ids that still have to be queried (SQL "in" list).
            $inSet = [];
            // id => name map for this job.
            $typeSet = [];
            foreach (['province', 'city', 'district'] as $level) {
                $typeId = $jobInfo[$level];
                if (!$typeId) {
                    continue;
                }
                if (isset($typeCache[$typeId])) {
                    $typeSet[$typeId] = $typeCache[$typeId];
                } else {
                    $inSet[] = intval($typeId);
                }
            }
            if (empty($inSet) && empty($typeSet)) {
                $this->error = 'province city district all empty';
                $succeeded   = false;
                break;
            }

            if (!empty($inSet)) {
                $sql = sprintf("select type_id, type_name from t_types where type_id in(%s)", implode(',', $inSet));
                try {
                    $stmt = $this->dbConnection->prepareStatement($sql);
                    $rs   = $stmt->executeQuery(ResultSet::FETCHMODE_ASSOC);
                } catch (Exception $ex) {
                    $this->error = 'sql exception: ' . $ex->getMessage();
                    $succeeded   = false;
                    break;
                }

                echo 'SQL: ' . $this->dbConnection->lastQuery . PHP_EOL;

                while ($rs->next()) {
                    $typeSet[$rs->getInt('type_id')]   = $rs->getString('type_name');
                    $typeCache[$rs->getInt('type_id')] = $rs->getString('type_name');
                }
            }

            $goodjobsAddress = [
                'province' => isset($typeSet[$jobInfo['province']]) ? $typeSet[$jobInfo['province']] : '',
                'city'     => isset($typeSet[$jobInfo['city']]) ? $typeSet[$jobInfo['city']] : '',
                'district' => isset($typeSet[$jobInfo['district']]) ? $typeSet[$jobInfo['district']] : '',
                'address'  => $jobInfo['address'],
            ];
            $baiduAddress = [
                'province' => $jobInfo['baidu_address']['province'],
                'city'     => $jobInfo['baidu_address']['city'],
                'district' => $jobInfo['baidu_address']['district'],
                'address'  => $jobInfo['baidu_address']['formatted_address'],
            ];

            $matchResult = $this->checkMatching($goodjobsAddress, $baiduAddress);
            if ($matchResult['status'] == 0) {
                // Addresses agree: nothing to record.
                continue;
            }

            // Store the readable names (not the ids) together with the reason.
            $jobInfo['province'] = $goodjobsAddress['province'];
            $jobInfo['city']     = $goodjobsAddress['city'];
            $jobInfo['district'] = $goodjobsAddress['district'];
            $jobInfo['reason']   = $matchResult['reason'];
            $this->addLogToFile($this->logDiffFile, $jobInfo);
        }

        // Single exit point so the handle is closed on the error paths too.
        fclose($fp);
        return $succeeded;
    }
 
/**
 * Compare a job's own region with the region Baidu returned for its marker.
 *
 * Checks are made in order province, city, district; the first difference
 * decides the reason. When the job has no district but Baidu has one, the
 * job's free-text address is searched for Baidu's district name and a
 * "possible mismatch" is reported if it does not occur there.
 *
 * @author huangmin <andhm@126.com>
 * @version 1.0
 * @date 2018-08-22
 * @param array $goodjobsAddress address stored on the job (keys: province, city, district, address)
 * @param array $baiduAddress address of the job's map marker (keys: province, city, district, address)
 * @return array ['status' => 0, 'reason' => ''] on match,
 *               ['status' => 1, 'reason' => <text>] on mismatch
 */
    protected function checkMatching($goodjobsAddress, $baiduAddress)
    {
        $reason = '';
        if ($goodjobsAddress['province'] != $baiduAddress['province']) {
            $reason = '省 不匹配';
        } elseif ($goodjobsAddress['city'] != $baiduAddress['city']) {
            $reason = '市 不匹配';
        } elseif (!empty($goodjobsAddress['district'])) {
            if ($goodjobsAddress['district'] != $baiduAddress['district']) {
                $reason = '区/县 不匹配';
            }
        } elseif (!empty($baiduAddress['district']) && !empty($goodjobsAddress['address'])) {
            // Plain substring search: the district name is data, not a regex
            // pattern, so it must not be interpolated into preg_match().
            if (stripos($goodjobsAddress['address'], $baiduAddress['district']) === false) {
                $reason = '区/县 可能不匹配';
            }
        }

        return ['status' => $reason === '' ? 0 : 1, 'reason' => $reason];
    }
 
    /**
     * Step 3: convert the diff log (one JSON document per line) into CSV rows
     * appended to the CSV export file.
     *
     * Rows are streamed from the diff log to the CSV file one at a time. The
     * title row is written only when the CSV file is new or empty, so running
     * the step again does not insert a second header in the middle of the
     * file. Blank or undecodable lines are skipped.
     *
     * @return bool false (with $this->error set) when either file cannot be opened
     */
    protected function storeJobInfoToExcel()
    {
        $titleArr = ['职位ID', '职位名称', '所属公司', '联系人', '联系电话', '省', '市', '区', '具体地址', '省(百度)', '市(百度)', '区(百度)', '具体地址(百度)', '不匹配原因'];

        $in = @fopen($this->logDiffFile, 'r');
        if (empty($in)) {
            $this->error = 'job diff file not exists';
            return false;
        }

        // Decide about the header before opening in append mode creates the file.
        clearstatcache(true, $this->logDiffExcelFile);
        $needsTitle = !file_exists($this->logDiffExcelFile) || filesize($this->logDiffExcelFile) === 0;

        $out = @fopen($this->logDiffExcelFile, 'a');
        if (empty($out)) {
            fclose($in);
            $this->error = 'cannot open csv file for writing: ' . $this->logDiffExcelFile;
            return false;
        }

        if ($needsTitle) {
            fputcsv($out, $titleArr);
        }

        // fgets() returns false at end of file, which ends the loop cleanly.
        while (($line = fgets($in)) !== false) {
            $line = trim($line);
            if ($line === '') {
                continue;
            }
            $jobInfo = json_decode($line, true);
            if (!is_array($jobInfo) || empty($jobInfo)) {
                // A damaged line must not hide all the records after it.
                continue;
            }

            // Column order matches $titleArr.
            fputcsv($out, [
                $jobInfo['job_id'],
                $jobInfo['job_alias_name'],
                $jobInfo['corp_name'],
                $jobInfo['linkman'],
                $jobInfo['phone'],
                $jobInfo['province'],
                $jobInfo['city'],
                $jobInfo['district'],
                $jobInfo['address'],
                $jobInfo['baidu_address']['province'],
                $jobInfo['baidu_address']['city'],
                $jobInfo['baidu_address']['district'],
                $jobInfo['baidu_address']['formatted_address'],
                $jobInfo['reason'],
            ]);
        }

        fclose($in);
        fclose($out);
        return true;
    }
/**
 * Append one record to a log file as a single JSON line.
 *
 * Nothing is written when the record cannot be encoded: writing the failed
 * result would add an empty line to the file. Encoding and write failures
 * are reported through error_log().
 *
 * @author huangmin <andhm@126.com>
 * @version 1.0
 * @date 2018-08-22
 * @param string $path log file path
 * @param array $data record to store
 * @return bool true when the line was written
 */
    protected function addLogToFile($path, $data)
    {
        $json = json_encode($data, JSON_UNESCAPED_UNICODE);
        if ($json === false) {
            error_log('addLogToFile: json_encode failed (json error ' . json_last_error() . ') for ' . $path);
            return false;
        }

        if (file_put_contents($path, $json . PHP_EOL, FILE_APPEND) === false) {
            error_log('addLogToFile: cannot write to ' . $path);
            return false;
        }

        return true;
    }
 
/**
 * Find the job_id of the last record stored in the job log, i.e. the point
 * from which step 1 resumes.
 *
 * The last three lines of the file are examined from the end backwards and
 * the first one that decodes to a record with a job_id wins. A blank or
 * truncated last line (for example after an interrupted write) therefore
 * falls back to the previous line instead of restarting from 0.
 *
 * @author huangmin <andhm@126.com>
 * @version 1.0
 * @date 2018-08-21
 * @return int last stored job_id, or 0 when none can be determined
 */
    protected function loadLastJobIdFromFile()
    {
        $lastJobList = $this->tail($this->logJobsFile, 3);
        foreach (array_reverse($lastJobList) as $line) {
            if (!is_string($line) || trim($line) === '') {
                continue;
            }
            $lastJob = json_decode($line, true);
            if (is_array($lastJob) && isset($lastJob['job_id'])) {
                return (int) $lastJob['job_id'];
            }
        }
        return 0;
    }
 
/**
 * Return the last $num lines of a file, oldest first.
 *
 * The file is read backwards in 4 KB chunks until enough line breaks have
 * been collected or the start of the file is reached. Every returned line
 * keeps its trailing "\n" when it has one. An unreadable or empty file, or
 * $num <= 0, yields an empty array.
 *
 * @author huangmin <andhm@126.com>
 * @version 1.0
 * @date 2018-08-21
 * @param string $file
 * @param int $num
 * @return array
 */
    private function tail($file, $num)
    {
        if ($num <= 0) {
            return [];
        }
        $fp = @fopen($file, 'r');
        if (empty($fp)) {
            return [];
        }

        fseek($fp, 0, SEEK_END);
        $offset    = ftell($fp);
        $buffer    = '';
        $chunkSize = 4096;

        // $num + 1 line breaks guarantee that the first of the wanted lines
        // is complete even when the buffer starts in the middle of a line.
        while ($offset > 0 && substr_count($buffer, "\n") <= $num) {
            $readSize = min($chunkSize, $offset);
            $offset  -= $readSize;
            fseek($fp, $offset, SEEK_SET);
            $chunk = fread($fp, $readSize);
            if ($chunk === false) {
                break;
            }
            $buffer = $chunk . $buffer;
        }
        fclose($fp);

        if ($buffer === '') {
            return [];
        }

        // Split after every "\n" so that each line keeps its line break.
        $lines = preg_split('/(?<=\n)/', $buffer, -1, PREG_SPLIT_NO_EMPTY);
        return array_slice($lines, -$num);
    }
 
/**
 * Apply "name=value" command-line arguments.
 *
 * Recognised names are "step" and "ak"; anything else, and any argument
 * without "=", is ignored. Only the first "=" separates name and value, so
 * a value may itself contain "=". A purely numeric step is stored as an
 * integer; any other value is stored unchanged so that run() can report it.
 *
 * @author huangmin <andhm@126.com>
 * @version 1.0
 * @date 2018-08-22
 * @param array $argv argument list as provided by PHP (element 0 is the script name)
 * @return void
 */
    public function parseArgv($argv)
    {
        // Element 0 is the script name, not a parameter.
        foreach (array_slice((array) $argv, 1) as $arg) {
            $parts = explode('=', $arg, 2);
            if (count($parts) !== 2) {
                continue;
            }
            list($paramName, $paramValue) = $parts;

            if ($paramName === 'step') {
                $isInteger  = (string) (int) $paramValue === $paramValue;
                $this->step = $isInteger ? (int) $paramValue : $paramValue;
            } elseif ($paramName === 'ak') {
                $this->ak = $paramValue;
            }
        }
    }
 
}
 
// Entry point: create the job for this script file, apply the command-line
// overrides (step / ak), then start it.
// NOTE(review): runAll() is not defined in this file - presumably inherited
// from BaseCron and expected to end up calling run(); confirm there.
$app = new RepairJobAddress(__FILE__);
$app->parseArgv($argv);
$app->runAll();

以上是关于php 【脚本】万级别导出比对数据(省市区差异)的主要内容,如果未能解决你的问题,请参考以下文章

我想比较两个oracle数据库表结构的差异,有现成的工具或脚本吗

Php导出百万数据的优化

如何高效的导出 百万级别的数据量 到 Excel?

c#导出百万级别数据到Excel速度优化到一分钟之内

JAVA使用POI如何导出百万级别数据

JAVA使用POI如何导出百万级别数据