Python Data Wrangling: Assignment 2, Part 2
# Code to import libraries as you need in this assessment, e.g.,
import csv
import time
import re
from tqdm.auto import tqdm  # tqdm_notebook is deprecated; tqdm.auto picks the right frontend
import dateutil.parser
import hashlib
import pandas as pd
import numpy as np
from nltk.corpus import brown  # requires a one-off nltk.download('brown')
from datapackage import Package
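# The matrix below records, for each job category, how often each combination
# of part/full time and permanent/contract occurred (the 't1' prefix suggests
# the figures were tallied from the task-1 solution; each category's weights
# sum to roughly 100). generate_time_val()/generate_type_val() further down
# use it to impute whichever of the two contract fields dataset2 leaves blank.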
t1_prob_matrix = {
    'engineering jobs':                  {'part-permanent': 3.0,
                                          'full-permanent': 21.0,
                                          'part-contract': 31.0,
                                          'full-contract': 47.0},
    'accounting and finance jobs':       {'part-permanent': 4.0,
                                          'part-contract': 12.0,
                                          'full-permanent': 28.0,
                                          'full-contract': 59.0},
    'healthcare and nursing jobs':       {'part-contract': 19.0,
                                          'part-permanent': 20.0,
                                          'full-contract': 23.0,
                                          'full-permanent': 39.0},
    'hospitality and catering jobs':     {'part-permanent': 4.0,
                                          'part-contract': 21.0,
                                          'full-permanent': 36.0,
                                          'full-contract': 42.0},
    'it jobs':                           {'part-permanent': 2.0,
                                          'full-permanent': 13.0,
                                          'part-contract': 34.0,
                                          'full-contract': 53.0},
    'sales jobs':                        {'part-permanent': 5.0,
                                          'full-permanent': 27.0,
                                          'part-contract': 29.0,
                                          'full-contract': 42.0},
    'teaching jobs':                     {'part-permanent': 5.0,
                                          'part-contract': 17.0,
                                          'full-permanent': 28.0,
                                          'full-contract': 53.0},
    'pr advertising and marketing jobs': {'part-permanent': 4.0,
                                          'part-contract': 19.0,
                                          'full-contract': 35.0,
                                          'full-permanent': 44.0}
}
class Job:
    def __init__(self):
        self.id = 0
        self.orig = []
        self.location = ""
        self.type = ""
        self.salary = 0
        self.source = ""
        self.contract_type = ""
        self.contract_time = ""
        self.dt_two = None
        self.dt_one = None
        self.post_time = 0
        self.salary_processed = False
        self.title = ""
        self.hash_val = 0
        self.company = ""

    # Shallow field-by-field copy; avoids deepcopy as it's very slow
    def clone(self):
        new_copy = Job()
        new_copy.id = self.id
        new_copy.orig = self.orig
        new_copy.location = self.location
        new_copy.type = self.type
        new_copy.salary = self.salary
        new_copy.source = self.source
        new_copy.contract_type = self.contract_type
        new_copy.contract_time = self.contract_time
        new_copy.dt_two = self.dt_two
        new_copy.dt_one = self.dt_one
        new_copy.post_time = self.post_time
        new_copy.salary_processed = self.salary_processed
        new_copy.title = self.title
        new_copy.hash_val = self.hash_val
        new_copy.company = self.company
        return new_copy
_csv_map_ = {"Id": 0, "Title":1,"Location":2,"Company":3,"ContractType":4,"ContractTime":5,"Category":6,"Salary":7,"OpenDate":8,"CloseDate":9,"Source":10 }
package = Package('https://datahub.io/core/world-cities/datapackage.json')
word_list = brown.words()
word_set = set(word_list)
_country_filter_ = 'United Kingdom'
def read_input_file(f_name):
    with open(f_name) as f:
        reader = csv.reader(f)
        data = [r for r in reader]
    data.pop(0)  # remove header
    return data
def generate_city_reg():
    # Build an alternation like '(london|leeds|...|dummy)'; the trailing
    # 'dummy' branch just terminates the chain of '|'s added in the loop
    r_rtr = "("
    for k, v in cities.items():
        r_rtr = r_rtr + k + "|"
    r_rtr = r_rtr + "dummy)"
    return r_rtr
# Utility functions
def read_cities():
    for resource in package.resources:
        if resource.descriptor['datahub']['type'] == 'derived/csv':
            for rsr in resource.read():
                if rsr[1] == _country_filter_:
                    cities[rsr[0].lower()] = rsr
def generate_id_hash(job_entry):
    hash_string = job_entry.title + job_entry.company + str(job_entry.dt_two) + str(job_entry.dt_one) + job_entry.type
    hash_val = hashlib.sha256(hash_string.encode()).hexdigest()
    job_entry.hash_val = hash_val
    return hash_val

def generate_id_hash_dict(job_entry):
    hash_string = job_entry['Title'] + job_entry['Company'] + str(job_entry['CloseDate']) + str(job_entry['OpenDate']) + job_entry['Category']
    hash_val = hashlib.sha256(hash_string.encode()).hexdigest()
    return hash_val
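# Both helpers fold Title + Company + CloseDate + OpenDate + Category into one
# sha256 digest, so any two postings that agree on those five fields (even if
# Salary, Location or Source differ) collapse to the same key. That digest is
# what drives the duplicate removal during the merge further down.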
# Parse the OpenDate and CloseDate values and check that they are in the
# correct order; if they are not, swap them
def parse_time(job_entry, entry):
    job_entry.dt_one = parse_date(entry[_csv_map_["OpenDate"]], job_entry)
    ux_one = int(time.mktime(job_entry.dt_one.timetuple()))
    job_entry.dt_two = parse_date(entry[_csv_map_["CloseDate"]], job_entry)
    ux_two = int(time.mktime(job_entry.dt_two.timetuple()))
    if ux_two < ux_one:
        job_entry.dt_two, job_entry.dt_one = job_entry.dt_one, job_entry.dt_two
    return job_entry
# Parse a date field; if dateutil rejects it, use a regex to swap the day and
# month digits and re-parse. Returns None if neither format matches.
def parse_date(val, job_entry):
    try:
        return dateutil.parser.parse(val)
    except ValueError:
        match_date = re.match(r'(\d{4})(\d{2})(\d{2})T(\d+)', val)
        if match_date:
            new_date = match_date.group(1) + match_date.group(3) + match_date.group(2) + "T" + match_date.group(4)
            return dateutil.parser.parse(new_date)
        return None
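# Hedged example of the fallback path: '20121302T000000' (day and month
# swapped, and month 13 is invalid) makes dateutil raise ValueError; the regex
# rebuilds it as '20120213T000000', which parses cleanly:
# parse_date('20121302T000000', None)  # -> datetime(2012, 2, 13, 0, 0)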
def parse_title(job_title, city_reg):
    processed = re.sub(r'\&', 'and', job_title)
    processed = re.sub(r'([^\s\w]|_)+', '', processed)
    processed = re.sub(' +', ' ', processed)
    # Strip a leading city name, but only when it is not also an ordinary
    # English word (checked against the Brown corpus)
    matched_city = re.match(city_reg, processed)
    if matched_city and (matched_city.group(1) not in word_set):
        processed = re.sub(re.escape(matched_city.group(1)), '', processed)
    return processed
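# Hedged illustration: assuming 'london' is in the world-cities list,
# 'london senior developer & analyst' becomes ' senior developer and analyst'
# (the city is dropped; a leading space survives because the whitespace
# collapse runs before the city removal). A leading city name that is also an
# everyday Brown-corpus word, e.g. 'bath', would be left in place.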
def parse_company_name(company_name):
    processed = re.sub(r'\&', 'and', company_name)
    processed = re.sub(r'([^\s\w]|_)+', '', processed)
    # \b guards keep the suffix removal from eating these letter runs inside
    # longer words
    processed = re.sub(r'\bpty\b', '', processed)
    processed = re.sub(r'\bltd\b', '', processed)
    processed = re.sub(r'\blimited\b', '', processed)
    processed = re.sub(' +', ' ', processed)
    return processed
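# Hedged example: parse_company_name('acme pty ltd & co.') yields 'acme and co'
# after the call site's .strip() — '&' becomes 'and', punctuation is dropped,
# the company suffixes are removed, and runs of spaces are collapsed.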
def parse_category(string_in):
    processed = re.sub(r'\&', 'and', string_in)
    processed = re.sub(r'([^\s\w]|_)+', '', processed)
    processed = re.sub(' +', ' ', processed)
    return processed
# Impute a missing part_time/full_time value as whichever is more frequent in
# t1_prob_matrix for this category, given the known permanent/contract value
def generate_time_val(time_val, category, contract_type):
    if len(time_val) > 1:
        return time_val
    cat = t1_prob_matrix[category]
    if contract_type == 'permanent':
        if cat['part-permanent'] >= cat['full-permanent']:
            return 'part_time'
        else:
            return 'full_time'
    else:
        if cat['part-contract'] >= cat['full-contract']:
            return 'part_time'
        else:
            return 'full_time'

# Likewise, impute a missing permanent/contract value given the known
# part_time/full_time value
def generate_type_val(type_val, category, contract_time):
    if len(type_val) > 1:
        return type_val
    cat = t1_prob_matrix[category]
    if contract_time == 'full_time':
        if cat['full-contract'] >= cat['full-permanent']:
            return 'contract'
        else:
            return 'permanent'
    else:
        if cat['part-contract'] >= cat['part-permanent']:
            return 'contract'
        else:
            return 'permanent'
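# Hedged usage sketch, using the 'it jobs' weights from t1_prob_matrix above
# (this assumes the dataset's convention that one field holds
# part_time/full_time and the other permanent/contract):
assert generate_time_val('', 'it jobs', 'permanent') == 'full_time'  # 13.0 > 2.0
assert generate_type_val('', 'it jobs', 'full_time') == 'contract'   # 53.0 > 13.0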
def write_solution(write_list):
    # use a context manager so the output file is flushed and closed
    with open("dataset_integrated.csv", "w") as sol_out:
        sol_out.write("Id, Title, Location, Company, ContractType, ContractTime, Category, Salary, OpenDate, CloseDate, Source\n")
        for job_key, job_entry in write_list.items():
            str_out = str(job_entry.id) + "," + job_entry.title.capitalize() + ", " + job_entry.location.capitalize() + ", " + job_entry.company.capitalize()
            str_out = str_out + ", " + job_entry.contract_type + ", " + job_entry.contract_time + ", " + job_entry.type.capitalize()
            str_out = str_out + ", " + str(job_entry.salary) + ", " + job_entry.dt_one.strftime('%Y%m%dT%H%M%S') + ", " + job_entry.dt_two.strftime('%Y%m%dT%H%M%S')
            str_out = str_out + ", " + job_entry.source
            sol_out.write(str_out + "\n")
# Build the UK city lookup and the leading-city regex used by parse_title()
cities = dict()
read_cities()
c_reg = generate_city_reg()
def process_data():
    hashed_list = dict()
    res_con = {'id_max': 0, 'loaded_hash': hashed_list}
    max_id = 0
    data = read_input_file('dataset1_solution.csv')
    progress = tqdm(total=len(data), desc="Reading/Processing Data")
    categories = dict()
    for entry in data:
        progress.update(1)
        job_entry = Job()
        job_entry.orig = entry
        job_entry.id = int(entry[_csv_map_["Id"]])
        job_entry.title = entry[_csv_map_["Title"]].lower()
        job_entry.location = entry[_csv_map_["Location"]].lower()
        job_entry = parse_time(job_entry, entry)
        job_entry.type = parse_category(entry[_csv_map_["Category"]].lower()).strip()
        job_entry.source = entry[_csv_map_["Source"]].lower()
        job_entry.contract_type = entry[_csv_map_["ContractType"]].lower()
        job_entry.contract_time = entry[_csv_map_["ContractTime"]].lower()
        job_entry.company = entry[_csv_map_["Company"]].lower()
        job_entry.hash_val = generate_id_hash(job_entry)
        job_entry.salary = float(entry[_csv_map_["Salary"]].lower())
        hashed_list[job_entry.hash_val] = job_entry
        if job_entry.type not in categories:
            categories[job_entry.type] = True
        if job_entry.id > max_id:
            max_id = job_entry.id
    progress.close()
    res_con['id_max'] = max_id
    res_con['categories'] = categories
    return res_con
loaded_val = process_data()
def process_second_data_set(second_file):
    ret_dict = dict()
    cat_dic = dict()
    job_list = []
    data = read_input_file(second_file)
    progress = tqdm(total=len(data), desc="Reading/Processing Data")
    for entry in data:
        val_read = dict()
        val_read['OpenDate'] = dateutil.parser.parse(entry[0])
        val_read['CloseDate'] = dateutil.parser.parse(entry[1])
        val_read['Title'] = parse_title(entry[2].lower(), c_reg)
        val_read['Company'] = parse_company_name(entry[3].lower()).strip()
        val_read['Location'] = entry[4].lower()
        val_read['Category'] = parse_category(entry[5].lower()).strip()
        # dataset2 salaries appear to be monthly, so convert to annual
        val_read['Salary'] = round(float(entry[6]) * 12)
        val_read['ContractTime'] = entry[8].lower()
        val_read['ContractType'] = entry[7].lower()
        val_read['Source'] = second_file
        val_read['Hash'] = 0
        val_read['Id'] = 0
        job_list.append(val_read)
        progress.update(1)
        if val_read['Category'] not in cat_dic:
            cat_dic[val_read['Category']] = True
    ret_dict['job_list'] = job_list
    ret_dict['categories'] = cat_dic
    progress.close()
    return ret_dict
second_set = process_second_data_set("dataset2.csv")
print("Loaded Second Set Values")
print(second_set['categories'])
print("First Loaded Values")
print(loaded_val['categories'])
# Based on the category listings printed above, map dataset2's category names
# onto dataset1's
map_dict = {'finance and accounting': 'accounting and finance jobs',
            'pr advertising and marketing': 'pr advertising and marketing jobs',
            'information technology': 'it jobs'}
def map_generate_hash(mapping, input_set):
    data = input_set['job_list']
    progress = tqdm(total=len(data), desc="Remapping & Generating Hash")
    for entry in data:
        entry['Category'] = mapping[entry['Category']]
        entry['Hash'] = generate_id_hash_dict(entry)
        progress.update(1)
    progress.close()
# Re-read dataset2 so re-running this cell does not remap categories twice
# (a second pass through map_dict would raise a KeyError)
second_set = process_second_data_set("dataset2.csv")
map_generate_hash(map_dict, second_set)
# Merge the two sets, dropping dataset2 entries whose hash already exists
def process_and_merge(first_list, second_list):
    hash_table = first_list['loaded_hash']
    entry_id = first_list['id_max'] + 1
    new_list = second_list['job_list']
    progress = tqdm(total=len(new_list), desc="Merging")
    dup_removed = 0
    for item in new_list:
        if item['Hash'] in hash_table:
            dup_removed = dup_removed + 1
            continue
        n_job = Job()
        n_job.id = entry_id
        n_job.title = item['Title']
        n_job.location = item['Location']
        n_job.dt_one = item['OpenDate']
        n_job.dt_two = item['CloseDate']
        n_job.type = item['Category']
        n_job.source = item['Source']
        n_job.salary = item['Salary']
        # ContractType holds part_time/full_time and ContractTime holds
        # permanent/contract, hence the crossed-looking argument order here
        n_job.contract_type = generate_time_val(item['ContractType'], item['Category'], item['ContractTime'])
        if n_job.contract_type == 'part time':
            n_job.contract_type = "part_time"  # normalise the space variant
        n_job.contract_time = generate_type_val(item['ContractTime'], item['Category'], item['ContractType'])
        n_job.company = item['Company']
        n_job.hash_val = item['Hash']
        hash_table[item['Hash']] = n_job
        progress.update(1)
        entry_id = entry_id + 1
    progress.close()
    print("Removed: " + str(dup_removed))
process_and_merge(loaded_val,second_set)
write_solution(loaded_val['loaded_hash'])