pandas:在pandas中搜索包含关键词的行

Posted by jasonzhangxianrong

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了pandas:在pandas中搜索包含关键词的行相关的知识,希望对你有一定的参考价值。

一、代码

# -*- coding: UTF-8 -*-
import json
import pandas as pd

"""获得所有的文本"""
def get_all_text(file_path="../datas/format/primary.json",
                 file_out="../datas/format/all_text.csv"):
    """Flatten every text entry of a JSON-lines file into a tab-separated CSV.

    Each non-blank input line is parsed as a JSON object holding a
    "file_name" value and a "datas" mapping. One output row is written per
    (key, value) pair of "datas", under the columns ``names``, ``roles``
    and ``texts``.

    Args:
        file_path: Path of the JSON-lines input file.
        file_out: Path of the tab-separated CSV file to write.
    """
    names = []
    roles = []
    texts = []
    with open(file_path, "r", encoding="utf8") as f:
        # Iterate the file object directly instead of loading every line
        # into memory with readlines().
        for data_line in f:
            if not data_line.strip():
                continue  # tolerate blank lines such as a trailing newline
            json_data = json.loads(data_line)
            file_name = json_data["file_name"]
            for role, text in json_data["datas"].items():
                names.append(file_name)
                roles.append(role)
                texts.append(text)
    dataframe = pd.DataFrame({"names": names, "roles": roles, "texts": texts})
    # Must be a real tab character: to_csv only accepts a one-character
    # separator, so the two-character backslash + "t" string is rejected.
    dataframe.to_csv(file_out, index=False, sep="\t")

"""从csv搜索数据"""
def search_text(key, file_path="../datas/format/all_text.csv", file_out=None):
    """Write the rows whose ``texts`` column contains ``key`` to a new CSV.

    Args:
        key: Keyword to look for. It is passed to ``Series.str.contains``
            with that method's default settings, so pandas interprets it
            as a regular expression.
        file_path: Tab-separated CSV file that has a ``texts`` column.
        file_out: Path of the tab-separated CSV to write. When omitted it
            is "../datas/classes/<key>.csv".
    """
    if file_out is None:
        file_out = "../datas/classes/" + key + ".csv"
    data = pd.read_csv(file_path, sep="\t")
    # na=False makes rows with missing text count as non-matches; without
    # it the mask contains NaN and boolean indexing raises.
    matches = data[data["texts"].str.contains(key, na=False)]
    matches.to_csv(file_out, index=False, sep="\t")

"""提取带有婚字的数据"""
def data_annotate(file_in="../datas/format/primary.json",
                  file_out="../datas/annotate/label.json",
                  keyword="婚"):
    """Copy the records that mention ``keyword`` into a file for labeling.

    Each non-blank input line is parsed as a JSON object holding a
    "file_name" value and a "datas" mapping of texts. A record is kept when
    at least one of its texts contains ``keyword``; it is written as one
    JSON line with the keys "name", "label" (an empty string, to be filled
    in by an annotator) and "datas".

    Args:
        file_in: Path of the JSON-lines input file.
        file_out: Path of the JSON-lines file to write.
        keyword: Substring that selects a record. The original test was
            ``"" in v``, which is true for every string, so nothing was
            filtered; the default restores the character named in the
            note above this function.

    Returns:
        The string "success".
    """
    with open(file_out, "w", encoding="utf8") as fo, \
            open(file_in, "r", encoding="utf8") as f:
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines such as a trailing newline
            json_data = json.loads(line)
            if any(keyword in text for text in json_data["datas"].values()):
                item = {
                    "name": json_data["file_name"],
                    "label": "",
                    "datas": json_data["datas"],
                }
                # A real newline keeps the output one record per line, which
                # is how the file is read back later.
                fo.write(json.dumps(item, ensure_ascii=False) + "\n")
    return "success"

"""提取标注过的数据"""
def annotate(file_in="../datas/annotate/label.json",
             file_labeled="../datas/annotate/labeled.json",
             file_unlabeled="../datas/annotate/unlabel.json"):
    """Split a JSON-lines file into labeled and unlabeled records.

    Each non-blank input line is parsed as a JSON object. Records whose
    "label" value is truthy are written to ``file_labeled``; all others
    are written to ``file_unlabeled``.

    Args:
        file_in: Path of the JSON-lines input file.
        file_labeled: Output path for records that have a label.
        file_unlabeled: Output path for records without a label.

    Returns:
        The string "success".
    """
    with open(file_in, "r", encoding="utf8") as f_in, \
            open(file_labeled, "w", encoding="utf8") as f_labeled, \
            open(file_unlabeled, "w", encoding="utf8") as f_unlabeled:
        for line in f_in:
            if not line.strip():
                continue  # tolerate blank lines such as a trailing newline
            json_data = json.loads(line)
            target = f_labeled if json_data["label"] else f_unlabeled
            # A real newline keeps both outputs one record per line.
            target.write(json.dumps(json_data, ensure_ascii=False) + "\n")
    return "success"

def label_to_csv(file_path="../datas/annotate/labeled.json",
                 file_out="../datas/annotate/labeled.csv"):
    """Export labeled JSON-lines records to a tab-separated CSV.

    Each non-blank input line is parsed as a JSON object holding a "label"
    value and a "datas" mapping of texts. One row is written per record
    with the columns ``labels`` (the label), ``datas`` (the texts joined
    with "|") and ``data_dict`` (the raw JSON line).

    Args:
        file_path: Path of the JSON-lines input file.
        file_out: Path of the tab-separated CSV file to write.
    """
    labels = []
    datas = []
    data_dict = []
    with open(file_path, "r", encoding="utf8") as f:
        for data_line in f:
            if not data_line.strip():
                continue  # tolerate blank lines such as a trailing newline
            json_data = json.loads(data_line)
            labels.append(json_data["label"])
            datas.append("|".join(json_data["datas"].values()))
            # Drop only the line terminator; removing literal backslash-n
            # sequences would corrupt escaped newlines inside the JSON.
            data_dict.append(data_line.rstrip("\n"))
    dataframe = pd.DataFrame(
        {"labels": labels, "datas": datas, "data_dict": data_dict}
    )
    # Must be a real tab character: to_csv only accepts a one-character
    # separator.
    dataframe.to_csv(file_out, index=False, sep="\t")

"""提取带工作的数据"""
def get_work():
    """Run the keyword search for the word "工作" (work)."""
    keyword = "工作"
    search_text(keyword)

if __name__ == "__main__":
    # Script entry point: export the labeled records to CSV.
    label_to_csv()

 

以上是关于pandas:在pandas中搜索包含关键词的行的主要内容,如果未能解决你的问题,请参考以下文章

pandas如何删除指定行

使用 pandas 选择和附加一些数据

过滤“pandas”中所有不包含字母(alpha)的行

在 pandas 中,如何选择包含 NaN 的行? [重复]

Pandas:从列表中选择包含任何子字符串的行

Pandas 删除列包含 * 的行