php 【脚本】万级别导出比对数据(省市区差异)
Posted
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了php 【脚本】万级别导出比对数据(省市区差异)相关的知识,希望对你有一定的参考价值。
<?php
/**
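* Repair jobs whose stored address disagrees with their map marker
* (the focus is on selecting the records that need repairing).
* 1. Find the jobs
* 2. Resolve each job's coordinates to a detailed Baidu Maps address
* 3. Compare province, city and district to find jobs that differ significantly
* 4. Save the result in the required file format
*
* usage: php repair_job_address.php step=1 ak=<your-baidu-map-ak>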
*
* @author huangmin <andhm@126.com>
* @version 1.0
* @date 2018-08-21
* @copyright 2018 goodjobs.cn
*/
require dirname(__FILE__) . '/../webapp/config.php';
require_once AG_APP_DIR . '/agavi.php';
require_once AG_DOC_ROOT . '/lib/BaseCron.php';
class RepairJobAddress extends BaseCron
{
/**
* Step 1: save job records (including the Baidu Maps address resolved from
* each job's latitude/longitude) to a file.
*/
const STEP_STORE_JOB = 1;
/**
* Step 2: read the saved job records, select those whose job address does not
* agree with the address of the marked Baidu coordinates, and save them to a
* file.
*/
const STEP_MATCH_ADDRESS = 2;
/**
* Step 3: save the mismatching records in CSV format.
*/
const STEP_STORE_TO_EXCEL = 3;
/**
* Step to execute (one of the STEP_* constants).
* @var int
*/
protected $step;
/**
* Baidu Maps API key ("ak" request parameter).
* @var string
*/
protected $ak;
/**
* Error code. Declared, but no method visible in this class assigns it.
* @var int
*/
protected $errno;
/**
* Error message set by a failing step and echoed by run().
* @var string
*/
protected $error;
/**
* Directory the log files live in.
* @var string
*/
protected $logDir;
/**
* Path of the job record log (one JSON object per line).
* @var string
*/
protected $logJobsFile;
/**
* Path of the log holding records whose job address does not match the
* Baidu Maps marker address.
* @var string
*/
protected $logDiffFile;
/**
* Path of the CSV export generated from the mismatch log.
* @var string
*/
protected $logDiffExcelFile;
/**
 * Set the default step and API key, derive the log file paths and hand over
 * to the parent constructor.
 *
 * @param string $cronFile Passed through unchanged to the parent constructor.
 */
public function __construct($cronFile)
{
    $this->step = self::STEP_STORE_JOB;
    // Default Baidu Maps key; "ak=..." on the command line replaces it (see parseArgv()).
    $this->ak = 'Z2MIXNix38i4SwOC3oAHz3dP4OYlQ6kN';

    $this->logDir = AG_DOC_ROOT . '/var/logs/http_to_https/';
    // The log files are written in append mode, which does not create missing
    // directories. Create the directory up front so step 1 cannot spend its
    // API calls and then fail to persist anything. mkdir() emits its own
    // warning if creation fails.
    if (!is_dir($this->logDir)) {
        mkdir($this->logDir, 0775, true);
    }

    $this->logJobsFile = $this->logDir . 'jobaddress.log';
    $this->logDiffFile = $this->logDir . 'jobaddress_diff.log';
    $this->logDiffExcelFile = $this->logDir . 'jobaddress_diff.csv';

    parent::__construct($cronFile);
}
/**
 * Core logic of the script: dispatch to the handler of the selected step.
 *
 * On failure the handler's error message is printed; "STEP n done." is only
 * printed when the step actually succeeded.
 *
 * @return void
 */
protected function run()
{
    // Loose comparison is kept on purpose: the step may arrive as a numeric
    // string from the command line.
    switch ($this->step) {
        case self::STEP_STORE_JOB:
            $succeeded = $this->storeJobInfoAndBaiduAddress();
            break;
        case self::STEP_MATCH_ADDRESS:
            $succeeded = $this->checkMatchingFromFile();
            break;
        case self::STEP_STORE_TO_EXCEL:
            $succeeded = $this->storeJobInfoToExcel();
            break;
        default:
            echo 'Unknown step ' . $this->step . PHP_EOL;
            return;
    }

    if (!$succeeded) {
        echo $this->error . PHP_EOL;
        return;
    }

    echo 'STEP ' . $this->step . ' done.' . PHP_EOL;
}
/**
* Intentionally empty: overrides the parent class implementation with no-op
* logic (per the original author's note).
* NOTE(review): when the parent invokes this is not visible in this file.
* @author huangmin <andhm@126.com>
* @version 1.0
* @date 2018-08-14
* @return void
*/
protected function beforeRun()
{
}
/**
* Intentionally empty: overrides the parent class implementation with no-op
* logic (per the original author's note).
* NOTE(review): when the parent invokes this is not visible in this file.
* @author huangmin <andhm@126.com>
* @version 1.0
* @date 2018-08-14
* @return void
*/
protected function afterRun()
{
}
/**
 * Step 1: page through the active jobs and append each one, together with
 * the Baidu Maps address resolved from its coordinates, to the jobs log.
 *
 * The scan starts after the job_id returned by loadLastJobIdFromFile(), so
 * an interrupted run can be resumed by running the step again. Jobs without
 * coordinates are logged with a null "baidu_address".
 *
 * @author huangmin <andhm@126.com>
 * @version 1.0
 * @date 2018-08-22
 * @return bool True when every batch was processed; false on an SQL error or
 *              a failed Baidu lookup (details in $this->error).
 */
protected function storeJobInfoAndBaiduAddress()
{
    $batchNum = 1000;
    $lastJobId = $this->loadLastJobIdFromFile();
    $today = date('Y-m-d');

    do {
        $sql = sprintf("select JOB.job_id, JOB.job_alias_name, JOB.province, JOB.city, JOB.district, JOB.address, JOB.linkman, JOB.phone, JOB.mem_corp_id, JOB.account_id, JOB.corp_id, JOB.map_lat, JOB.map_lng, CORP.corp_name from t_jobs as JOB left join t_corp_info as CORP ON JOB.corp_id=CORP.corp_id where JOB.job_status='NORMAL' and JOB.date_end>%s and JOB.job_id>%u order by JOB.job_id asc limit %u", "'" . $today . "'", $lastJobId, $batchNum);
        try {
            $stmt = $this->dbConnection->prepareStatement($sql);
            $rs = $stmt->executeQuery(ResultSet::FETCHMODE_ASSOC);
        } catch (Exception $ex) {
            $this->error = 'sql exception: ' . $ex->getMessage();
            return false;
        }
        echo 'SQL: ' . $this->dbConnection->lastQuery . PHP_EOL;

        while ($rs->next()) {
            $jobId = $rs->getInt('job_id');
            $jobAliasName = $rs->getString('job_alias_name');
            $province = $rs->getInt('province');
            $city = $rs->getInt('city');
            $district = $rs->getInt('district');
            $address = $rs->getString('address');
            $linkman = $rs->getString('linkman');
            $phone = $rs->getString('phone');
            $memCorpId = $rs->getInt('mem_corp_id');
            $accountId = $rs->getInt('account_id');
            $corpId = $rs->getInt('corp_id');
            $mapLat = $rs->getString('map_lat');
            $mapLng = $rs->getString('map_lng');
            $corpName = $rs->getString('corp_name');

            // Only jobs that carry a map marker are looked up at Baidu.
            $addressFromBd = null;
            if (!empty($mapLat) && !empty($mapLng)) {
                $addressFromBd = $this->getAddressFromBaiduMap($mapLat . ',' . $mapLng);
                if ($addressFromBd === false) {
                    // Abort so the run can be resumed later. Name the job,
                    // because a resume will retry exactly this record.
                    $this->error .= '; job_id: ' . $jobId;
                    return false;
                }
            }

            // Key order defines the layout of the JSON line; keep it stable.
            $this->addLogToFile($this->logJobsFile, [
                'job_id' => $jobId,
                'job_alias_name' => $jobAliasName,
                'corp_id' => $corpId,
                'corp_name' => $corpName,
                'mem_corp_id' => $memCorpId,
                'account_id' => $accountId,
                'linkman' => $linkman,
                'phone' => $phone,
                'address' => $address,
                'province' => $province,
                'city' => $city,
                'district' => $district,
                'map_lat' => $mapLat,
                'map_lng' => $mapLng,
                'baidu_address' => $addressFromBd,
            ]);
            $lastJobId = $jobId;
        }
        // A short batch means the table has been exhausted.
    } while ($rs->getRecordCount() == $batchNum);

    return true;
}
/**
 * Reverse-geocode a coordinate through the Baidu Maps geocoder v2 API.
 *
 * @author huangmin <andhm@126.com>
 * @version 1.0
 * @date 2018-08-21
 * @param string $location Coordinate as "lat,lng".
 * @return bool|array False on transport, decoding or API error (details in
 *                    $this->error); otherwise an array with the keys
 *                    formatted_address, province, city, district, street and
 *                    street_number (null for any field missing from the response).
 */
protected function getAddressFromBaiduMap($location)
{
    $url = sprintf('http://api.map.baidu.com/geocoder/v2/?location=%s&output=json&ak=%s', $location, $this->ak);

    $curl = curl_init();
    curl_setopt($curl, CURLOPT_URL, $url);
    curl_setopt($curl, CURLOPT_RETURNTRANSFER, 1);
    // Without timeouts a single stalled connection would hang the whole
    // export indefinitely.
    curl_setopt($curl, CURLOPT_CONNECTTIMEOUT, 5);
    curl_setopt($curl, CURLOPT_TIMEOUT, 15);
    $response = curl_exec($curl);
    if ($response === false || curl_errno($curl)) {
        $this->error = 'curl error: ' . curl_error($curl) . '; location: ' . $location;
        curl_close($curl);
        return false;
    }
    curl_close($curl);

    $decoded = json_decode($response, true);
    if (empty($decoded) || !is_array($decoded)) {
        // Report the raw body: the decoded value is empty here and says
        // nothing about what the server actually sent.
        $this->error = 'json_decode error: ' . var_export($response, true) . '; location: ' . $location;
        return false;
    }
    // A response without a status field is treated as an error rather than
    // as success.
    if (!isset($decoded['status']) || $decoded['status'] != 0) {
        $this->error = 'baidu map api return error: ' . var_export($decoded, true) . '; location: ' . $location;
        return false;
    }

    $result = (isset($decoded['result']) && is_array($decoded['result'])) ? $decoded['result'] : [];
    $component = (isset($result['addressComponent']) && is_array($result['addressComponent']))
        ? $result['addressComponent']
        : [];

    // An empty formatted_address is accepted on purpose (the original check
    // for it was disabled by the author).
    $address = [
        'formatted_address' => isset($result['formatted_address']) ? $result['formatted_address'] : null,
    ];
    foreach (['province', 'city', 'district', 'street', 'street_number'] as $field) {
        $address[$field] = isset($component[$field]) ? $component[$field] : null;
    }

    return $address;
}
/**
 * Step 2: read the jobs log line by line, translate each job's province /
 * city / district ids to names via t_types, compare them with the Baidu
 * address and append every mismatching record to the diff log.
 *
 * Lines that are blank or not valid JSON are skipped, so one damaged line
 * does not hide all records after it. Records without a Baidu address (no
 * coordinates) are ignored.
 *
 * @author huangmin <andhm@126.com>
 * @version 1.0
 * @date 2018-08-22
 * @return bool False when the jobs log cannot be opened, a record carries no
 *              region id at all, or an SQL error occurs ($this->error set).
 */
protected function checkMatchingFromFile()
{
    $fp = @fopen($this->logJobsFile, 'r');
    if (empty($fp)) {
        $this->error = 'job file not exists';
        return false;
    }

    // type_id => type_name, shared across records to avoid repeated queries.
    // An empty string marks an id that was looked up but not found.
    $typeCache = [];

    while (($line = fgets($fp)) !== false) {
        $jobInfo = json_decode($line, true);
        if (empty($jobInfo) || !is_array($jobInfo)) {
            // Blank or damaged line: skip it and keep going.
            continue;
        }
        if (!isset($jobInfo['baidu_address'])) {
            // No coordinates were set for this job; nothing to compare.
            continue;
        }

        $regionIds = [];
        foreach (['province', 'city', 'district'] as $level) {
            $regionIds[$level] = empty($jobInfo[$level]) ? 0 : intval($jobInfo[$level]);
        }
        // Compatibility mapping for Chaohu (per the original author's note):
        // city id 1046 is treated as city 1043 / district 5563.
        if ($regionIds['city'] == 1046) {
            $regionIds['city'] = 1043;
            $regionIds['district'] = 5563;
        }

        if (!array_filter($regionIds)) {
            fclose($fp);
            $this->error = 'province city district all empty; job_id: '
                . (isset($jobInfo['job_id']) ? $jobInfo['job_id'] : '?');
            return false;
        }

        // Ids not yet cached have to be fetched from the database.
        $missing = [];
        foreach ($regionIds as $id) {
            if ($id && !isset($typeCache[$id])) {
                $missing[$id] = $id;
            }
        }
        if (!empty($missing)) {
            $sql = sprintf("select type_id, type_name from t_types where type_id in(%s)", implode(',', $missing));
            try {
                $stmt = $this->dbConnection->prepareStatement($sql);
                $rs = $stmt->executeQuery(ResultSet::FETCHMODE_ASSOC);
            } catch (Exception $ex) {
                fclose($fp);
                $this->error = 'sql exception: ' . $ex->getMessage();
                return false;
            }
            echo 'SQL: ' . $this->dbConnection->lastQuery . PHP_EOL;
            while ($rs->next()) {
                $typeCache[$rs->getInt('type_id')] = $rs->getString('type_name');
            }
            // Remember unknown ids too, so they are not queried again for
            // every following record.
            foreach ($missing as $id) {
                if (!isset($typeCache[$id])) {
                    $typeCache[$id] = '';
                }
            }
        }

        $goodjobsAddress = ['address' => isset($jobInfo['address']) ? $jobInfo['address'] : ''];
        foreach ($regionIds as $level => $id) {
            $goodjobsAddress[$level] = ($id && isset($typeCache[$id])) ? $typeCache[$id] : '';
        }

        $bd = $jobInfo['baidu_address'];
        $baiduAddress = [
            'province' => isset($bd['province']) ? $bd['province'] : null,
            'city' => isset($bd['city']) ? $bd['city'] : null,
            'district' => isset($bd['district']) ? $bd['district'] : null,
            'address' => isset($bd['formatted_address']) ? $bd['formatted_address'] : null,
        ];

        $matchResult = $this->checkMatching($goodjobsAddress, $baiduAddress);
        if ($matchResult['status'] == 0) {
            // Addresses agree; nothing to record.
            continue;
        }

        // The diff record carries region names instead of ids, plus the reason.
        $jobInfo['province'] = $goodjobsAddress['province'];
        $jobInfo['city'] = $goodjobsAddress['city'];
        $jobInfo['district'] = $goodjobsAddress['district'];
        $jobInfo['reason'] = $matchResult['reason'];
        $this->addLogToFile($this->logDiffFile, $jobInfo);
    }

    fclose($fp);
    return true;
}
/**
 * Compare the job's own address with the address Baidu Maps reports for the
 * job's coordinates.
 *
 * Province and city must be equal. If the job has a district it must equal
 * Baidu's district; if it has none, Baidu's district name is expected to
 * occur (case-insensitively, as a plain substring) in the job's free-text
 * address.
 *
 * @author huangmin <andhm@126.com>
 * @version 1.0
 * @date 2018-08-22
 * @param array $goodjobsAddress Address stored on the job (province, city, district, address).
 * @param array $baiduAddress    Address of the marked coordinates (same keys).
 * @return array ['status' => 0, 'reason' => ''] on match,
 *               ['status' => 1, 'reason' => string] on mismatch.
 */
protected function checkMatching($goodjobsAddress, $baiduAddress)
{
    // Normalise both sides to strings so the comparisons below can be strict
    // (missing or null fields compare as '').
    $own = [];
    $baidu = [];
    foreach (['province', 'city', 'district', 'address'] as $key) {
        $own[$key] = isset($goodjobsAddress[$key]) ? (string) $goodjobsAddress[$key] : '';
        $baidu[$key] = isset($baiduAddress[$key]) ? (string) $baiduAddress[$key] : '';
    }

    if ($own['province'] !== $baidu['province']) {
        return ['status' => 1, 'reason' => '省 不匹配'];
    }
    if ($own['city'] !== $baidu['city']) {
        return ['status' => 1, 'reason' => '市 不匹配'];
    }

    if (!empty($own['district'])) {
        if ($own['district'] !== $baidu['district']) {
            return ['status' => 1, 'reason' => '区/县 不匹配'];
        }
        return ['status' => 0, 'reason' => ''];
    }

    // The job has no district: fall back to looking for Baidu's district
    // name inside the job's free-text address. A plain substring search is
    // used instead of a regex so that characters such as "(" or "#" in the
    // district name cannot break the pattern and produce a false mismatch.
    if (!empty($baidu['district'])
        && !empty($own['address'])
        && stripos($own['address'], $baidu['district']) === false
    ) {
        return ['status' => 1, 'reason' => '区/县 可能不匹配'];
    }

    return ['status' => 0, 'reason' => ''];
}
/**
 * Step 3: convert the diff log (one JSON record per line) into a CSV file.
 *
 * The CSV is regenerated from scratch on every run: the diff log is always
 * read in full, so appending would duplicate the header and every row.
 * Records are streamed straight to the CSV instead of being buffered, and
 * lines that are blank or not valid JSON are skipped.
 *
 * @return bool False when the diff log cannot be read or the CSV cannot be
 *              opened for writing ($this->error set); true otherwise.
 */
protected function storeJobInfoToExcel()
{
    $titleArr = ['职位ID', '职位名称', '所属公司', '联系人', '联系电话', '省', '市', '区', '具体地址', '省(百度)', '市(百度)', '区(百度)', '具体地址(百度)', '不匹配原因'];

    $in = @fopen($this->logDiffFile, 'r');
    if (empty($in)) {
        $this->error = 'job diff file not exists';
        return false;
    }

    $out = @fopen($this->logDiffExcelFile, 'w');
    if (empty($out)) {
        fclose($in);
        $this->error = 'cannot open csv file for writing: ' . $this->logDiffExcelFile;
        return false;
    }

    fputcsv($out, $titleArr);

    while (($line = fgets($in)) !== false) {
        $jobInfo = json_decode($line, true);
        if (empty($jobInfo) || !is_array($jobInfo)) {
            // Blank or damaged line: skip it and keep exporting.
            continue;
        }

        // Column order must match $titleArr.
        fputcsv($out, [
            $jobInfo['job_id'],
            $jobInfo['job_alias_name'],
            $jobInfo['corp_name'],
            $jobInfo['linkman'],
            $jobInfo['phone'],
            $jobInfo['province'],
            $jobInfo['city'],
            $jobInfo['district'],
            $jobInfo['address'],
            $jobInfo['baidu_address']['province'],
            $jobInfo['baidu_address']['city'],
            $jobInfo['baidu_address']['district'],
            $jobInfo['baidu_address']['formatted_address'],
            $jobInfo['reason'],
        ]);
    }

    fclose($in);
    fclose($out);

    return true;
}
/**
 * Append one record to a log file as a single JSON line.
 *
 * If the record cannot be encoded (for example invalid UTF-8 in a string),
 * nothing is written: a blank line would otherwise end up in the log, and
 * the readers of these logs decode line by line.
 *
 * @author huangmin <andhm@126.com>
 * @version 1.0
 * @date 2018-08-22
 * @param string $path Log file path.
 * @param array  $data Record to store.
 * @return bool True when the line was written, false otherwise.
 */
protected function addLogToFile($path, $data)
{
    $json = json_encode($data, JSON_UNESCAPED_UNICODE);
    if ($json === false) {
        echo 'json_encode error (code ' . json_last_error() . '), record skipped: ' . $path . PHP_EOL;
        return false;
    }

    // LOCK_EX keeps a line intact if two runs append to the same file.
    return file_put_contents($path, $json . PHP_EOL, FILE_APPEND | LOCK_EX) !== false;
}
/**
 * Determine the job_id to resume from by inspecting the last lines of the
 * jobs log.
 *
 * The last three lines are examined from the end backwards and the first one
 * that decodes to a record with a job_id wins. Blank or damaged trailing
 * lines are skipped, so they no longer reset the scan to the beginning.
 *
 * @author huangmin <andhm@126.com>
 * @version 1.0
 * @date 2018-08-21
 * @return int Last stored job_id, or 0 when none can be determined.
 */
protected function loadLastJobIdFromFile()
{
    $lastLines = array_reverse($this->tail($this->logJobsFile, 3));

    foreach ($lastLines as $line) {
        // tail() can yield false (empty file) and lines that hold only a newline.
        if (!is_string($line) || trim($line) === '') {
            continue;
        }
        $lastJob = json_decode($line, true);
        if (is_array($lastJob) && isset($lastJob['job_id'])) {
            return (int) $lastJob['job_id'];
        }
    }

    return 0;
}
/**
* Read the last lines of a file by seeking backwards from its end.
*
* Returns at most $num lines in file order; each entry is whatever fgets()
* returned, so lines keep their trailing newline. Returns an empty array
* when the file cannot be opened. For an empty file the single fgets() call
* yields false, so the result is array(false).
*
* @author huangmin <andhm@126.com>
* @version 1.0
* @date 2018-08-21
* @param string $file Path of the file to read.
* @param int $num Maximum number of lines to return.
* @return array
*/
private function tail($file, $num)
{
$fp = @fopen($file, 'r');
if (empty($fp)) {
return array();
}
// Start two bytes before the end, i.e. the final byte of the file (normally
// the trailing newline of the last line) is never inspected.
$pos = -2;
$eof = "";
$head = false; // set once the start of the file is reached (file has fewer lines than requested)
$lines = array();
while ($num > 0) {
// Walk backwards one byte at a time until a newline is found.
while ($eof != "\n") {
if (fseek($fp, $pos, SEEK_END) == 0) {
// fseek() returns 0 on success and -1 on failure
$eof = fgetc($fp);
$pos--;
} else {
// Seeking before the start of the file failed: the first line has been reached
fseek($fp, 0, SEEK_SET);
$head = true; // reached the head of the file
break;
}
}
// The file pointer now sits at the start of a line; read it and put it in front.
array_unshift($lines, fgets($fp));
// This check must come after the read above: on reaching the head, the
// first line still has to be read before leaving the outer loop.
if ($head) {
break;
}
$eof = "";
$num--;
}
fclose($fp);
return $lines;
}
/**
 * Apply "name=value" command-line arguments.
 *
 * Recognised names are "step" and "ak". Only the first "=" separates name
 * and value, so values may contain "=" themselves. Arguments without "=" and
 * unknown names are ignored.
 *
 * @author huangmin <andhm@126.com>
 * @version 1.0
 * @date 2018-08-22
 * @param array $argv Raw argument vector; the first element (script name) is skipped.
 * @return void
 */
public function parseArgv($argv)
{
    foreach (array_slice($argv, 1) as $arg) {
        $parts = explode('=', $arg, 2);
        if (count($parts) !== 2) {
            // Not a name=value pair.
            continue;
        }
        list($paramName, $paramValue) = $parts;

        if ($paramName == 'step') {
            // Store numeric steps as int (the property is documented as
            // int). Anything else is kept verbatim so the "Unknown step"
            // message can show what was actually passed.
            $this->step = ctype_digit($paramValue) ? (int) $paramValue : $paramValue;
        } elseif ($paramName == 'ak') {
            $this->ak = $paramValue;
        }
    }
}
}
// Entry point: build the task, apply the "step=" / "ak=" overrides from the
// command line, then start it.
$app = new RepairJobAddress(__FILE__);
$app->parseArgv($argv);
// NOTE(review): runAll() is not defined in this file; presumably inherited
// from BaseCron and expected to end up calling run() — confirm.
$app->runAll();
以上是关于php 【脚本】万级别导出比对数据(省市区差异)的主要内容,如果未能解决你的问题,请参考以下文章