[Notes] Processing TSV files with Python, plus .sql and .tsv downloads of pronunciations, definitions, and example sentences for 144,790 English words

Posted by u25th_engineer



  See the GitHub repository for details:

  Link


  This is old stuff from two years ago; I was bored this weekend, so I dug it out to play around with again.

  My earlier posts:

  Post 1

  Post 2

  Post 3



import csv
import re
import time
import operator


# fp = csv.writer(fp, delimiter='\t', quoting=csv.QUOTE_NONE, quotechar=None, escapechar="|")

def write_to_tsv(output_path: str, file_columns: list, data: list):
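    # Write a list of dicts out as TSV. With QUOTE_NONE and quotechar=None, fields are never
    # quoted; a literal tab inside a field is escaped with '|' instead. Note that the
    # 'utf-8-sig' encoding prepends a BOM to the output file.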
    csv.register_dialect('tsv_dialect', delimiter='\t', quoting=csv.QUOTE_NONE, quotechar=None, escapechar="|")
    wf = open(output_path, 'w', newline='', encoding='utf-8-sig')
    # with open(output_path, 'w', newline='', encoding='utf-8-sig') as wf:
    writer = csv.DictWriter(wf, fieldnames=file_columns, dialect='tsv_dialect')
    writer.writerows(data)
    csv.unregister_dialect('tsv_dialect')
    wf.close()


def read_from_tsv(file_path: str, column_names: list) -> list:
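    # Read a TSV back into a list of dicts keyed by column_names. Note the asymmetry with
    # write_to_tsv: that function writes 'utf-8-sig' (with a BOM) while this one reads plain
    # 'utf8', so the first field of the first row can come back prefixed with '\ufeff';
    # that is why '\ufeff9' and replace('\ufeff', '') are handled further down.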
    csv.register_dialect('tsv_dialect', delimiter='\t', quoting=csv.QUOTE_ALL)
    with open(file_path, 'r', encoding='utf8') as wf:
        reader = csv.DictReader(wf, fieldnames=column_names, dialect='tsv_dialect')
        data_list = []
        for row in reader:
            data = dict(row)
            data_list.append(data)
    csv.unregister_dialect('tsv_dialect')
    return data_list


def extract_line_with_pronunciation(file_path: str, file_path_new_1: str, file_path_new_2: str):
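    # Split the raw dump line by line: lines containing a tab followed by 美[ or 英[
    # (i.e. lines that carry a pronunciation) go to file_path_new_1, the rest to file_path_new_2.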
    wf = open(file_path, 'r', encoding='utf8')
    nf1 = open(file_path_new_1, 'w+', encoding='utf8')
    nf2 = open(file_path_new_2, 'w+', encoding='utf8')
    line = wf.readline()
    pattern_1 = re.compile(r'\t美\[')
    pattern_2 = re.compile(r'\t英\[')
    while line:
        res_1 = pattern_1.search(line)
        res_2 = pattern_2.search(line)
        if res_1 or res_2:
            nf1.write(line)
        else:
            nf2.write(line)
        line = wf.readline()
    nf2.close()
    nf1.close()
    wf.close()


def split_pronunciation_line(file_path: str, column_names: list) -> list:
    data_list = read_from_tsv(file_path, column_names)  # ['WORD_ID', 'SINGLE_WORD', 'WORD_MEANINGS', 'EXAMPLE_SENTENCES']
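    # pattern_1 matches entries carrying both US (美) and UK (英) pronunciations, pattern_2 UK only,
    # pattern_3 US only. A hypothetical WORD_MEANINGS value looks like '美[...]英[...],n. ...', so
    # the matched prefix becomes the pronunciation and the remainder, after stripping the match,
    # is kept as the meaning.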
    pattern_1 = re.compile(r'美\[.*英.*\][,]')
    pattern_2 = re.compile(r'英\[.*\][,]')
    pattern_3 = re.compile(r'美\[.*\][,]')
    set_usa_un = set()
    set_un = set()
    set_usa = set()
    id_pronunciation = dict()
    id_meaning = dict()
    complete_format = []

    for column in data_list:
        pm = column['WORD_MEANINGS']
        res_1 = pattern_1.search(pm)
        res_2 = pattern_2.search(pm)
        res_3 = pattern_3.search(pm)
        id = int(column['WORD_ID'])

        if res_1:
            set_usa_un.add(id)
            id_pronunciation[id] = res_1.group()
            id_meaning[id] = str(pm).replace(res_1.group(), '')
        elif res_2 and (id not in set_usa_un):  # and (id not in set_usa):
            set_un.add(id)
            id_pronunciation[id] = res_2.group()
            id_meaning[id] = str(pm).replace(res_2.group(), '')
        elif res_3 and (id not in set_usa_un):  # and (id not in set_un):
            set_usa.add(id)
            id_pronunciation[id] = res_3.group()
            id_meaning[id] = str(pm).replace(res_3.group(), '')

        # All three pronunciation cases build the same record; the 'EXAMPLE_sentence' key must
        # match the fieldnames later passed to write_to_tsv. Rows that matched none of the
        # patterns are skipped instead of re-appending the previous item.
        if id in set_usa_un or id in set_un or id in set_usa:
            item = {'id': id, 'word': column['SINGLE_WORD'], 'pronunciation': id_pronunciation[id],
                    'meaning': id_meaning[id], 'EXAMPLE_sentence': column['EXAMPLE_SENTENCES']}
            complete_format.append(item)

    return complete_format


def split_pronunciation_line_no_pronunciation(file_path: str, column_names: list) -> list:
    data_list = read_from_tsv(file_path, column_names)
    complete_format = []
    for column in data_list:
        if column['WORD_ID'] == '\ufeff9':
            column['WORD_ID'] = '9'
        '''
        # The following item was wrong: the last key must be the uppercase 'EXAMPLE_SENTENCES',
        # not the lowercase 'example_sentences', to match the DictWriter fieldnames.
        item = {'id': column['WORD_ID'], 'word': column['SINGLE_WORD'], 'pronunciation': '',
                'meaning': column['WORD_MEANINGS'], 'example_sentences': column['EXAMPLE_SENTENCES']}
        '''
        item = {'id': column['WORD_ID'], 'word': column['SINGLE_WORD'], 'pronunciation': '',
                'meaning': column['WORD_MEANINGS'], 'EXAMPLE_SENTENCES': column['EXAMPLE_SENTENCES']}
        complete_format.append(item)
    # print(complete_format)

    # complete_format = [{'id': '9', 'word': 'grimoires', 'pronunciation': '网络释义: 魔法之书;中世纪巫术之书;魔典;',
    #                     'meaning': '', 'EXAMPLE_SENTENCES': ''},
    #                    {'id': '10', 'word': 'subreptitious', 'pronunciation': '网络释义: 隐瞒事实的;',
    #                     'meaning': '', 'EXAMPLE_SENTENCES': ''}]
    return complete_format


def combine_tsv_with_same_headers(file_path_1: str, file_path_2: str, column_names: list) -> list:
    data_list_1 = read_from_tsv(file_path_1, column_names)
    data_list_2 = read_from_tsv(file_path_2, column_names)
    complete_format = []
    for column in data_list_1:
        item = {'id': column['id'], 'word': column['word'], 'pronunciation': column['pronunciation'],
                'meaning': column['meaning'], 'EXAMPLE_SENTENCES': column['EXAMPLE_SENTENCES']}
        complete_format.append(item)
    for column in data_list_2:
        item = {'id': column['id'], 'word': column['word'], 'pronunciation': column['pronunciation'],
                'meaning': column['meaning'], 'EXAMPLE_SENTENCES': column['EXAMPLE_SENTENCES']}
        complete_format.append(item)
    return complete_format


def fun1(file_path_: str):
    data_list = read_from_tsv(file_path_, ['WORD_ID', 'SINGLE_WORD', 'WORD_MEANINGS', 'EXAMPLE_SENTENCES'])
    cnt = 0
    for i in data_list:
        print(i)
        cnt += 1
        if cnt == 100:
            break


def fun2(file_path_origin: str, file_path_new_1: str, file_path_new_2: str):
    '''
    file_path_origin = './TSV_data/origin/unduplicated_word_pronounciations_meanings_example_sentences.tsv'
    file_path_new_1 = './TSV_data/1_yes.tsv'
    file_path_new_2 = './TSV_data/2_no.tsv'
    '''
    time_start = time.time()
    extract_line_with_pronunciation(file_path_origin, file_path_new_1, file_path_new_2)
    time_end = time.time()
    time_sum = time_end - time_start
    print('Time used:\n' + str(time_sum) + ' seconds.')


def fun3(file_path: str, output_path: str, column_names_1: list, column_names_2: list):
    '''
    file_path = './TSV_data/1_yes.tsv'
    output_path = './TSV_data/partly_completed.tsv'
    column_names_1 = ['WORD_ID', 'SINGLE_WORD', 'WORD_MEANINGS', 'EXAMPLE_SENTENCES']
    column_names_2 = ['id', 'word', 'pronunciation', 'meaning', 'EXAMPLE_sentence']
    '''
    complete_format = split_pronunciation_line(file_path, column_names_1)
    write_to_tsv(output_path, column_names_2, complete_format)


def fun4(file_path: str, output_path: str, column_names_1: list, column_names_2: list):
    '''
    file_path = './TSV_data/2_no.tsv'
    output_path = './TSV_data/partly_completed_no.tsv'
    column_names_1 = ['WORD_ID', 'SINGLE_WORD', 'WORD_MEANINGS', 'EXAMPLE_SENTENCES']
    column_names_2 = ['id', 'word', 'pronunciation', 'meaning', 'EXAMPLE_SENTENCES']
    '''
    complete_format = split_pronunciation_line_no_pronunciation(file_path, column_names_1)
    write_to_tsv(output_path, column_names_2, complete_format)


def fun5(output_path_combine: str, file_path_1: str, file_path_2: str, column_names: list):
    '''
    output_path_combine = './TSV_data/combined_version.tsv'
    file_path_1 = './TSV_data/partly_completed_yes.tsv'
    file_path_2 = './TSV_data/partly_completed_no.tsv'
    column_names = ['id', 'word', 'pronunciation', 'meaning', 'EXAMPLE_SENTENCES']
    '''
    complete_format = combine_tsv_with_same_headers(file_path_1, file_path_2, column_names)

    # # sorted by id first and alphabetically second.
    # complete_format = sorted(complete_format, key=lambda elem: "%06d %s" %
    #         (int(str(elem['id']).replace('\ufeff', '').replace('"', '')), elem['word']))

    # # sorted by id only.
    complete_format = sorted(complete_format, key=lambda elem: "%06d" %
                               (int(str(elem['id']).replace('\ufeff', '').replace('"', ''))))

    # # sorted alphabetically only.
    # complete_format = sorted(complete_format, key=lambda elem: "%s" % (elem['word']))
    write_to_tsv(output_path_combine, column_names, complete_format)


def fun6(file_path_len_field: str, column_names: list):
    '''
    file_path_len_field = './TSV_data/3_combined_version_sorted_by_consecutive_IDs.tsv'
    column_names = ['id', 'word', 'pronunciation', 'meaning', 'EXAMPLE_SENTENCES']
    '''
    complete_format = read_from_tsv(file_path_len_field, column_names)
    maxLen = -1
    field_name = ''
    maxLen_1 = -1
    maxLen_2 = -1
    maxLen_3 = -1
    maxLen_4 = -1
    maxLen_5 = -1
    for column in complete_format:
        maxLen_1 = max(len(column['id']), maxLen_1)
        maxLen_2 = max(len(column['word']), maxLen_2)
        maxLen_3 = max(len(column['pronunciation']), maxLen_3)
        maxLen_4 = max(len(column['meaning']), maxLen_4)
        maxLen_5 = max(len(column['EXAMPLE_SENTENCES']), maxLen_5)
        for field in column_names:
            # Record which field actually holds the longest value, not just the last field iterated.
            if len(column[field]) > maxLen:
                maxLen = len(column[field])
                field_name = field
    print(maxLen)
    print(field_name)
    print('**********')
    print(str(maxLen_1) + ' ' + str(maxLen_2) + ' ' + str(maxLen_3) + ' ' + str(maxLen_4) + ' ' + str(maxLen_5))
    return maxLen


def fun7(file_path_1: str, file_path_2: str, column_names: list):
    '''
    # # origin:
    def fun7(file_path_1: str, file_path_2: str, column_names: list) -> (list, list):
    file_path_1 = './TSV_data/3_combined_version_sorted_by_consecutive_IDs.tsv'
    file_path_2 = './TSV_data/word.csv'
    column_names = ['id', 'word', 'pronunciation', 'meaning', 'EXAMPLE_SENTENCES']
    '''
    complete_format = read_from_tsv(file_path_1, column_names)
    word_list_1 = []
    id_list = []
    for column in complete_format:
        id_list.append(column['id'])
        word_list_1.append(column['word'])
    file_word_data = open(file_path_2, 'r', encoding='utf-8')
    word_data = file_word_data.readline()
    word_list_2 = []
    while word_data:
        word_list_2.append(word_data.strip('\n'))
        word_data = file_word_data.readline()
    file_word_data.close()

    word_list_1 = sorted(word_list_1)
    word_list_2 = sorted(word_list_2)
    print(word_list_1[0:90])
    print(word_list_2[0:90])
    print(operator.eq(word_list_1, word_list_2))
    print(str(len(word_list_1)) + ' - ' + str(len(word_list_2)))
    for word in word_list_1:
        if word not in word_list_2:
            print('Found. -- ' + word)
            break
    print('*************************************')
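    # The first and ninth ids come back mangled (presumably the BOM / stray quote characters that
    # replace('\ufeff', '') handles elsewhere), so patch them by hand before converting to int.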
    id_list[0] = 1
    id_list[8] = 9
    id_list = [int(j) for j in id_list]
    # cnt = 0
    # for i in id_list:
    #     id_list[cnt] = int(i)
    #     cnt += 1
    print(id_list[0:20])
    print(len(id_list))
    discrete_id_list = []
    for i in range(1, len(id_list)):
        if id_list[i - 1] != id_list[i] - 1:
            # print(id_list[i])
            discrete_id_list.append(id_list[i] - 1)

    print(len(discrete_id_list))
    print(discrete_id_list)
    print('*************************************')
    # # origin:
    '''return word_list_1, word_list_2'''


def fun8(file_path: str):
    '''
    file_path = './TSV_data/id.csv'
    '''
    file_id_data = open(file_path, 'r', encoding='utf-8')
    id_data = file_id_data.readline()
    id_list = []
    while id_data:
        id_list.append(int(id_data))
        id_data = file_id_data.readline()
    file_id_data.close()

    print(len(id_list))
    discrete_id_list = []
    for i in range(1, len(id_list)):
        if id_list[i - 1] != id_list[i] - 1:
            discrete_id_list.append(id_list[i] - 1)
    print(len(discrete_id_list))
    print(discrete_id_list)
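
For reference, the post does not show the driver code that strings these helpers together, so here is a minimal sketch of how they might be called in order. The paths and column lists are assumptions lifted from the docstrings above, not the author's actual run.

if __name__ == '__main__':
    # Column names are assumptions taken from the docstrings above.
    columns_raw = ['WORD_ID', 'SINGLE_WORD', 'WORD_MEANINGS', 'EXAMPLE_SENTENCES']
    columns_out = ['id', 'word', 'pronunciation', 'meaning', 'EXAMPLE_SENTENCES']

    # 1. Split the raw dump into rows with and without a pronunciation.
    fun2('./TSV_data/origin/unduplicated_word_pronounciations_meanings_example_sentences.tsv',
         './TSV_data/1_yes.tsv', './TSV_data/2_no.tsv')

    # 2. Pull the pronunciation out of the meaning field for rows that have one.
    fun3('./TSV_data/1_yes.tsv', './TSV_data/partly_completed_yes.tsv',
         columns_raw, ['id', 'word', 'pronunciation', 'meaning', 'EXAMPLE_sentence'])

    # 3. Reshape the remaining rows into the same five-column layout.
    fun4('./TSV_data/2_no.tsv', './TSV_data/partly_completed_no.tsv', columns_raw, columns_out)

    # 4. Merge both halves and sort the result by numeric id.
    fun5('./TSV_data/combined_version.tsv',
         './TSV_data/partly_completed_yes.tsv', './TSV_data/partly_completed_no.tsv', columns_out)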

