Python 通过 python-docx 模块处理 .doc 及 .docx 后缀文件的内容

Posted 2021-01-06

tags:

篇首语：本文由小常识网(cha138.com)小编整理，主要介绍如何用 Python 的 python-docx 模块处理 .doc 及 .docx 后缀文件的内容，希望对你有一定的参考价值。

import os,shutil,docx,re,time
from win32com import client as wc
# Copy the .doc files found at every level of a directory tree into one target directory
def count_files(file_dir):
    """Copy every ``.doc`` file under *file_dir* into a flat backup directory.

    The tree rooted at *file_dir* is walked recursively.  Each file whose
    extension is ``.doc`` (compared case-insensitively) has its path printed
    and is copied into the directory named ``file_dir + "back"``, which is
    created the first time a match is found.  The copy is flat, so files that
    share a base name in different sub-directories overwrite one another.

    Args:
        file_dir: Root directory to search.

    Returns:
        The number of ``.doc`` files found and copied.
    """
    dst_dir = file_dir + "back"
    count = 0
    for parent, _dirs, files in os.walk(file_dir):
        for name in files:
            # splitext looks only at the real extension, so a file literally
            # named "doc" (no extension) is not mistaken for a Word document,
            # and lower() lets "RESUME.DOC" match as well.
            if os.path.splitext(name)[1].lower() != ".doc":
                continue
            count += 1
            src_path = os.path.join(parent, name)
            print(src_path)
            os.makedirs(dst_dir, exist_ok=True)
            shutil.copy(src_path, dst_dir)
    return count
# Extract the e-mail addresses from every .docx resume; this uses the python-docx package (install with: pip install python-docx)
def count_mail(file_dir,dst_file):
    """Collect e-mail addresses from the ``.docx`` files under *file_dir*.

    Every ``.docx`` file in the tree is opened with python-docx and the text
    of each entry in ``doc.paragraphs`` is searched for e-mail addresses.
    Each address has its spaces removed, gets a trailing ``;`` and is written
    to *dst_file*, which is overwritten.

    Args:
        file_dir: Root directory containing the ``.docx`` files.
        dst_file: Path of the text file that receives the addresses.
    """
    # Compiled once, outside the loops.  The domain part deliberately
    # tolerates whitespace (stray spaces inside an address are stripped
    # below).  NOTE(review): because of that tolerance the match can run on
    # into following text when the paragraph contains a later ".xx" token.
    pattern = re.compile(
        r'''([a-zA-Z0-9._%+-]+@[a-zA-Z0-9\t\s.-]+(\.[a-zA-Z0-9\t\s]{2,4}))''',
        re.VERBOSE,
    )
    mail_list = []
    for parent, _dirs, files in os.walk(file_dir):
        for name in files:
            # docx.Document cannot open anything but .docx packages, and Word
            # leaves "~$..." lock files behind that are not real documents.
            if name.startswith("~$") or not name.lower().endswith(".docx"):
                continue
            doc = docx.Document(os.path.join(parent, name))
            for para in doc.paragraphs:
                for groups in pattern.findall(para.text):
                    # groups[0] is the whole address; groups[1] is the suffix.
                    mail_list.append(groups[0].replace(" ", "") + ";")
    with open(dst_file, 'w') as out:
        out.writelines(mail_list)
    print("=====================邮件信息写入成功===================")
# python-docx can only read .docx files, so .doc files must first be converted to .docx through Word, driven via the win32com module
def docxTodoc(old_doc,new_doc):
    """Convert the documents under *old_doc* to ``.docx`` files in *new_doc*.

    Despite its name, this function converts *to* ``.docx``: each file in the
    tree rooted at *old_doc* is opened in Word through COM automation and
    saved into *new_doc* under the same base name with a ``.docx`` extension.
    The target path and a timestamp are printed for every converted file.

    Args:
        old_doc: Root directory holding the source documents.
        new_doc: Existing directory that receives the converted files.
    """
    word = wc.Dispatch('Word.Application')
    try:
        for parent, _dirs, files in os.walk(old_doc):
            for name in files:
                # "~$..." files are Word's own lock files, not documents.
                if name.startswith("~$"):
                    continue
                # Word runs as a separate process and does not share Python's
                # working directory, so hand it absolute paths only.
                src_path = os.path.abspath(os.path.join(parent, name))
                # splitext keeps dots inside the base name
                # ("john.smith.doc" -> "john.smith.docx").
                new_filepath = os.path.abspath(
                    os.path.join(new_doc, os.path.splitext(name)[0] + ".docx")
                )
                doc = word.Documents.Open(src_path)  # source document
                try:
                    print(new_filepath)
                    # 12 is Word's wdFormatXMLDocument (.docx) save format.
                    doc.SaveAs(new_filepath, 12, False, "", True, "", False, False, False, False)
                finally:
                    # Close even if SaveAs fails so the file is not left locked.
                    doc.Close()
                print(time.time())
    finally:
        # Always shut Word down; otherwise a hidden WINWORD process lingers.
        word.Quit()

if __name__ == '__main__':
    # Script entry point: back up the exported .doc resumes, convert them to
    # .docx, then extract the e-mail addresses from the converted files.
    old_doc = r"C:\Users\icestick\Desktop\51job_导出简历_20180917"  # source directory with the .doc files
    new_doc = r"C:\Users\icestick\Desktop\new_doc"                  # target directory for the converted .docx files
    mail_extract = r"C:\Users\icestick\Desktop\test.txt"            # file that receives the extracted addresses

    print(count_files(old_doc))

    if not os.path.exists(new_doc):
        os.mkdir(new_doc)
        print("=====================目录创建成功======================")

    # Conversion must run before extraction: count_mail only reads .docx files.
    docxTodoc(old_doc, new_doc)
    print("=====================docx格式转换成功===================")
    count_mail(new_doc, mail_extract)

以上是关于python通过docx模块解决doc及docx后缀文件内容的处理的主要内容，如果未能解决你的问题，请参考以下文章

python模块将doc/pdf/docx/rtf格式转换为文本[重复]

Python：读取 .doc.docx 两种 Word 文件简述及“Word 未能引发事件”错误

用python实现批量替换.doc文件文件内容

python 操作 word 文档，使用 python-docx 操作 word docx 文档

如何利用python_docx提取文字和文字模块儿？

利用python将docx文件转为txt