Python Data Wrangling Assignment 2, Part 2


# Code to import libraries as you need in this assessment, e.g.,
import csv
import time
import re
from tqdm.auto import tqdm  # tqdm_notebook is deprecated in recent tqdm releases
import dateutil.parser
import hashlib
import pandas as pd
import numpy as np
from nltk.corpus import brown  # requires the corpus data: nltk.download('brown')
from datapackage import Package

# Percentage distribution of (ContractType, ContractTime) combinations per job
# category, used below to impute missing values with the most likely combination.
t1_prob_matrix = {
    'engineering jobs': {'part-permanent': 3.0,
                         'full-permanent': 21.0,
                         'part-contract': 31.0,
                         'full-contract': 47.0},
    'accounting and finance jobs': {'part-permanent': 4.0,
                                    'part-contract': 12.0,
                                    'full-permanent': 28.0,
                                    'full-contract': 59.0},
    'healthcare and nursing jobs': {'part-contract': 19.0,
                                    'part-permanent': 20.0,
                                    'full-contract': 23.0,
                                    'full-permanent': 39.0},
    'hospitality and catering jobs': {'part-permanent': 4.0,
                                      'part-contract': 21.0,
                                      'full-permanent': 36.0,
                                      'full-contract': 42.0},
    'it jobs': {'part-permanent': 2.0,
                'full-permanent': 13.0,
                'part-contract': 34.0,
                'full-contract': 53.0},
    'sales jobs': {'part-permanent': 5.0,
                   'full-permanent': 27.0,
                   'part-contract': 29.0,
                   'full-contract': 42.0},
    'teaching jobs': {'part-permanent': 5.0,
                      'part-contract': 17.0,
                      'full-permanent': 28.0,
                      'full-contract': 53.0},
    'pr advertising and marketing jobs': {'part-permanent': 4.0,
                                          'part-contract': 19.0,
                                          'full-contract': 35.0,
                                          'full-permanent': 44.0}
}

class Job:
    def __init__(self):
        self.id = 0
        self.orig = []
        self.location = ""
        self.type = ""
        self.salary = 0
        self.source = ""
        self.contract_type  = ""
        self.contract_time  = ""
        self.dt_two = None
        self.dt_one = None
        self.post_time = 0
        self.salary_processed = False
        self.title = ""
        self.hash_val = 0
        self.company = ""
    # Field-by-field shallow copy; avoids copy.deepcopy(), which is very slow
    def clone(self):
        new_copy = Job()
        new_copy.id =  self.id
        new_copy.orig = self.orig 
        new_copy.location = self.location 
        new_copy.type = self.type 
        new_copy.salary = self.salary
        new_copy.source = self.source 
        new_copy.contract_type  = self.contract_type 
        new_copy.contract_time  = self.contract_time 
        new_copy.dt_two = self.dt_two 
        new_copy.dt_one = self.dt_one
        new_copy.post_time = self.post_time 
        new_copy.salary_processed = self.salary_processed 
        new_copy.title = self.title
        new_copy.hash_val = self.hash_val
        new_copy.company = self.company 
        return new_copy
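
# A quick sanity check (hypothetical values): clone() copies field references, not
# the lists they point to, so the 'orig' row is shared between the two objects.
_j = Job()
_j.orig = ['raw', 'row']
_k = _j.clone()
_k.orig.append('x')
assert _j.orig == ['raw', 'row', 'x']  # in-place mutation is visible through both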
    
_csv_map_ = {"Id": 0, "Title":1,"Location":2,"Company":3,"ContractType":4,"ContractTime":5,"Category":6,"Salary":7,"OpenDate":8,"CloseDate":9,"Source":10 }
package = Package('https://datahub.io/core/world-cities/datapackage.json')
word_list = brown.words()
word_set = set(word_list)
_country_filter_  = 'United Kingdom'

def read_input_file(f_name):
    with open(f_name) as f:
        reader = csv.reader(f)
        data = [r for r in reader]
        data.pop(0) # remove header
    return data

def generate_city_reg():
    # Build an alternation of all known city names; re.escape guards against
    # regex metacharacters in the names, and the 'dummy' branch keeps the
    # pattern valid even when the dict is empty
    return "(" + "|".join([re.escape(k) for k in cities] + ["dummy"]) + ")"
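
# For example, with cities = {'london': ..., 'leeds': ...} this returns the pattern
# '(london|leeds|dummy)'. Note that it reads the module-level 'cities' dict, which
# is only populated further down by read_cities(), so it must be called after that.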

# Utility functions
# Load every row of the world-cities data whose country matches _country_filter_,
# keyed by lower-cased city name
def read_cities():
    for resource in package.resources:
        if resource.descriptor['datahub']['type'] == 'derived/csv':
            for rsr in resource.read():
                if rsr[1] == _country_filter_:
                    cities[rsr[0].lower()] = rsr


def generate_id_hash(job_entry):
    hash_string = job_entry.title + job_entry.company  + str(job_entry.dt_two) + str(job_entry.dt_one) + job_entry.type
    hash_val = hashlib.sha256(hash_string.encode()).hexdigest() 
    job_entry.hash_val = hash_val
    return hash_val
    
def generate_id_hash_dict(job_entry):
    hash_string = job_entry['Title'] + job_entry['Company'] + str(job_entry['CloseDate']) + str(job_entry['OpenDate']) + job_entry['Category']
    hash_val = hashlib.sha256(hash_string.encode()).hexdigest() 
    return hash_val
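
# Sanity check (hypothetical values): records that agree on the five key fields hash
# identically, which is what the merge step below relies on to detect duplicates.
_r1 = {'Title': 'developer', 'Company': 'acme', 'CloseDate': '20130201T000000',
       'OpenDate': '20130101T000000', 'Category': 'it jobs'}
assert generate_id_hash_dict(_r1) == generate_id_hash_dict(dict(_r1))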

# Parse the OpenDate and CloseDate values and check that they are in the correct
# order; if they are not, swap them
def parse_time(job_entry, entry):

    job_entry.dt_one = parse_date(entry[_csv_map_["OpenDate"]], job_entry)
    ux_one = int(time.mktime(job_entry.dt_one.timetuple()))
  
    job_entry.dt_two = parse_date(entry[_csv_map_["CloseDate"]], job_entry) 
    ux_two = int(time.mktime(job_entry.dt_two.timetuple()))
    
    if ux_two < ux_one:
        # The dates were reversed in the source data; swap them back
        job_entry.dt_one, job_entry.dt_two = job_entry.dt_two, job_entry.dt_one
    return job_entry

# Parse a date field; if dateutil cannot parse it, use a regex to rebuild the value
# (the malformed entries have their day and month fields transposed) and retry
def parse_date(val, job_entry):
    try:
        return dateutil.parser.parse(val)
    except ValueError:
        match_date = re.match(r'(\d{4})(\d{2})(\d{2})T(\d+)', val)
        if match_date:
            # Swap the day and month fields back into YYYYMMDD order
            new_date = match_date.group(1) + match_date.group(3) + match_date.group(2) + "T" + match_date.group(4)
            return dateutil.parser.parse(new_date)
        # If the regex does not match either, None is returned implicitly
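
# Quick check of the repair path: month 21 is invalid, so the regex swaps the middle
# fields and '20132110T120000' is re-read as 2013-10-21 12:00:00.
assert parse_date('20132110T120000', None) == dateutil.parser.parse('20131021T120000')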
        
def parse_title(job_title, city_reg):
    processed = re.sub(r'&', 'and', job_title)          # normalise ampersands
    processed = re.sub(r'([^\s\w]|_)+', '', processed)  # strip punctuation
    processed = re.sub(' +', ' ', processed)            # collapse whitespace
    # Drop a leading city name from the title, unless it is also an ordinary
    # English word (checked against the Brown corpus)
    matched_city = re.match(city_reg, processed)
    if matched_city and (matched_city.group(1) not in word_set):
        processed = re.sub(re.escape(matched_city.group(1)), '', processed)
    return processed

def parse_company_name(company_name):
    processed = re.sub(r'&', 'and', company_name)
    processed = re.sub(r'([^\s\w]|_)+', '', processed)
    # Strip common company suffixes; word boundaries avoid clobbering these
    # letter sequences inside other words
    processed = re.sub(r'\bpty\b', '', processed)
    processed = re.sub(r'\bltd\b', '', processed)
    processed = re.sub(r'\blimited\b', '', processed)
    processed = re.sub(' +', ' ', processed)
    return processed

def parse_category(string_in):
    processed = re.sub(r'&', 'and', string_in)
    processed = re.sub(r'([^\s\w]|_)+', '', processed)
    processed = re.sub(' +', ' ', processed)
    return processed

# Impute a missing ContractType value (in this dataset ContractType is
# 'full_time'/'part_time') by picking the more probable option for the given
# category and ContractTime ('permanent'/'contract')
def generate_time_val(time_val, category, contract_type):
    if len(time_val) > 1:
        return time_val  # already populated
    cat = t1_prob_matrix[category]
    if contract_type == 'permanent':
        return 'part_time' if cat['part-permanent'] >= cat['full-permanent'] else 'full_time'
    else:
        return 'part_time' if cat['part-contract'] >= cat['full-contract'] else 'full_time'

# Impute a missing ContractTime value ('permanent'/'contract') by picking the
# more probable option for the given category and ContractType
# ('full_time'/'part_time')
def generate_type_val(type_val, category, contract_time):
    if len(type_val) > 1:
        return type_val  # already populated
    cat = t1_prob_matrix[category]
    if contract_time == 'full_time':
        return 'contract' if cat['full-contract'] >= cat['full-permanent'] else 'permanent'
    else:
        return 'contract' if cat['part-contract'] >= cat['part-permanent'] else 'permanent'
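
# Spot checks of the imputation rules against t1_prob_matrix: for 'it jobs',
# full-permanent (13.0) outweighs part-permanent (2.0), and full-contract (53.0)
# outweighs full-permanent (13.0).
assert generate_time_val('', 'it jobs', 'permanent') == 'full_time'
assert generate_type_val('', 'it jobs', 'full_time') == 'contract'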

def write_solution(write_list):
    # Use a with-block so the file is flushed and closed when writing finishes
    with open("dataset_integrated.csv", "w") as sol_out:
        sol_out.write("Id, Title, Location, Company, ContractType, ContractTime, Category, Salary, OpenDate, CloseDate, Source\n")
        for job_key, job_entry in write_list.items():
            str_out = str(job_entry.id) + ", " + job_entry.title.capitalize() + ", " + job_entry.location.capitalize() + ", " + job_entry.company.capitalize()
            str_out = str_out + ", " + job_entry.contract_type + ", " + job_entry.contract_time + ", " + job_entry.type.capitalize()
            str_out = str_out + ", " + str(job_entry.salary) + ", " + job_entry.dt_one.strftime('%Y%m%dT%H%M%S') + ", " + job_entry.dt_two.strftime('%Y%m%dT%H%M%S')
            str_out = str_out + ", " + job_entry.source
            sol_out.write(str_out + "\n")
        
cities = dict()
read_cities()
c_reg = generate_city_reg()



def process_data():
    hashed_list = dict()
    res_con = {'id_max':0, 'loaded_hash': hashed_list}
    max_id = 0
    data = read_input_file('dataset1_solution.csv')
    progress = tqdm(total=len(data), desc="Reading/Processing Data")
    categories = dict()
    for entry in data:
        progress.update(1)
        job_entry = Job()      
        job_entry.orig = entry
        job_entry.id = int(entry[_csv_map_["Id"]])
        job_entry.title =  entry[_csv_map_["Title"]].lower()
        job_entry.location = entry[_csv_map_["Location"]].lower()
        job_entry = parse_time(job_entry,entry)
        job_entry.type = parse_category(entry[_csv_map_["Category"]].lower()).strip()
        job_entry.source = entry[_csv_map_["Source"]].lower()
        job_entry.contract_type = entry[_csv_map_["ContractType"]].lower()
        job_entry.contract_time = entry[_csv_map_["ContractTime"]].lower()
        job_entry.company = entry[_csv_map_["Company"]].lower()
        job_entry.hash_val = generate_id_hash(job_entry)
        job_entry.salary = float(entry[_csv_map_["Salary"]].lower())
        hashed_list[job_entry.hash_val] = job_entry
        if job_entry.type not in categories:
            categories[job_entry.type] = True
        if job_entry.id > max_id:
            max_id = job_entry.id
    progress.close()
    res_con['id_max'] = max_id
    res_con['categories'] = categories
    return res_con

loaded_val = process_data()


def process_second_data_set(second_file):
    ret_dict = dict()
    cat_dic = dict()
    job_list = []
    data = read_input_file(second_file)
    progress = tqdm(total=len(data), desc="Reading/Processing Data")
    for entry in data:
        val_read = dict()
        val_read['OpenDate']  = dateutil.parser.parse(entry[0])
        val_read['CloseDate'] = dateutil.parser.parse(entry[1])
        val_read['Title'] = parse_title(entry[2].lower(),c_reg)
        val_read['Company'] = parse_company_name(entry[3].lower()).strip()
        val_read['Location'] = entry[4].lower()
        val_read['Category'] = parse_category(entry[5].lower()).strip()
        val_read['Salary'] = round(float(entry[6])*12)  # dataset2 salaries appear to be monthly; convert to annual
        val_read['ContractTime'] = entry[8].lower() 
        val_read['ContractType'] = entry[7].lower()
        val_read['Source'] = second_file
        val_read['Hash'] = 0
        val_read['Id'] = 0
        job_list.append(val_read)
        progress.update(1)
        if val_read['Category'] not in cat_dic:
            cat_dic[val_read['Category']] = True
    ret_dict['job_list'] = job_list
    ret_dict['categories'] = cat_dic
    progress.close()
    return ret_dict
        
second_set = process_second_data_set("dataset2.csv")    
print("Loaded Second Set Values")
print(second_set['categories'])
print("First Loaded Values")
print(loaded_val['categories'])


# Based on the category listings printed above, map dataset2's category names
# onto the names used in dataset1
map_dict = { 'finance and accounting'       : 'accounting and finance jobs', 
             'pr advertising and marketing' : 'pr advertising and marketing jobs',
             'information technology'       : 'it jobs'}



def map_generate_hash(mapping, input_set):
    data = input_set['job_list']
    progress = tqdm(total=len(data), desc="Remapping & Generating Hash")
    for entry in data:
        # Fall back to the original name for any category not in the mapping
        entry['Category'] = mapping.get(entry['Category'], entry['Category'])
        entry['Hash'] = generate_id_hash_dict(entry)
        progress.update(1)
    progress.close()

# Re-read the second dataset from scratch before remapping, since
# map_generate_hash mutates the entries in place
second_set = process_second_data_set("dataset2.csv")
map_generate_hash(map_dict,second_set)

# Merge the second dataset into the first, skipping entries whose hash already
# exists in the first dataset (duplicates)
def process_and_merge(first_list, second_list):
    hash_table = first_list['loaded_hash']
    entry_id = first_list['id_max'] + 1
    new_list = second_list['job_list']
    progress = tqdm(total=len(new_list), desc="Merging")
    dup_removed = 0
    for item in new_list:
        if item['Hash'] in hash_table:
            dup_removed = dup_removed + 1
            continue
        n_job = Job()
        n_job.id = entry_id
        n_job.title =  item['Title']
        n_job.location = item['Location'] 
        n_job.dt_one = item['OpenDate']
        n_job.dt_two = item['CloseDate']
        n_job.type = item['Category']
        n_job.source = item['Source']
        n_job.salary = item['Salary']
        # In this dataset ContractType is 'full_time'/'part_time' and ContractTime
        # is 'permanent'/'contract'; impute whichever value is missing
        n_job.contract_type = generate_time_val(item['ContractType'], item['Category'], item['ContractTime'])
        if n_job.contract_type == 'part time':
            n_job.contract_type = "part_time"  # normalise dataset2's 'part time' spelling
        n_job.contract_time = generate_type_val(item['ContractTime'], item['Category'], item['ContractType'])
        n_job.company =  item['Company']
        n_job.hash_val = item['Hash']
        hash_table[item['Hash']] = n_job
        progress.update(1)
        entry_id = entry_id + 1
    progress.close()
    print("Removed: "+str(dup_removed))
    
    
process_and_merge(loaded_val,second_set)


write_solution(loaded_val['loaded_hash'])
