Example Python code for a simple PDF table scraper

# 1. Add some necessary libraries
import scraperwiki
import urllib2, lxml.etree

# 2. The URL/web address where we can find the PDF we want to scrape
url = 'http://cdn.varner.eu/cdn-1ce36b6442a6146/Global/Varner/CSR/Downloads_CSR/Fabrikklister_VarnerGruppen_2013.pdf'

# 3. Grab the file and convert it to an XML document we can work with
pdfdata = urllib2.urlopen(url).read()
xmldata = scraperwiki.pdftoxml(pdfdata)
root = lxml.etree.fromstring(xmldata)
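
# (Aside: this scraper targets the Python 2 runtime of the classic
#  ScraperWiki platform, hence urllib2 and the print statements. On Python 3
#  the fetch would look roughly like this untested sketch, with urllib2's
#  urlopen living in urllib.request instead:
#      import urllib.request
#      pdfdata = urllib.request.urlopen(url).read()
#  The pdftoxml and lxml steps then follow the same pattern.)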

# 4. Have a peek at the XML (click the "more" link in the Console to preview it).
print lxml.etree.tostring(root, pretty_print=True)

# 5. How many pages in the PDF document?
pages = list(root)
print "There are",len(pages),"pages"

# Steps 6-11 below are earlier iterations of the scraper. They are wrapped in
# a string literal so they don't execute; only step 12 at the bottom runs.
'''
# 6. Iterate through the elements in each page, and preview them
for page in pages:
    for el in page:
        if el.tag == "text":
            print el.text, el.attrib

# REPLACE STEP 6 WITH THE FOLLOWING
# 7. We can use the positioning attributes in the XML data to help us regenerate the rows and columns
for page in pages:
    for el in page:
        if el.tag == "text":
            if int(el.attrib['left']) < 100: print 'Country:', el.text,
            elif int(el.attrib['left']) < 250: print 'Factory name:', el.text,
            elif int(el.attrib['left']) < 500: print 'Address:', el.text,
            elif int(el.attrib['left']) < 1000: print 'City:', el.text,
            else:
                print 'Region:', el.text

# REPLACE STEP 7 WITH THE FOLLOWING
# 8. Rather than just printing out the data, we can generate and display a data structure representing each row.
#    We can also skip the first page, the title page that doesn't contain any of the tabulated information we're after.
for page in pages[1:]:
    for el in page:
        if el.tag == "text":
            if int(el.attrib['left']) < 100: data = { 'Country': el.text }
            elif int(el.attrib['left']) < 250: data['Factory name'] = el.text
            elif int(el.attrib['left']) < 500: data['Address'] = el.text
            elif int(el.attrib['left']) < 1000: data['City'] = el.text
            else:
                data['Region'] = el.text
                print data
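
# (Aside: the if/elif ladder above gets repeated in every step that follows.
#  A more compact variant, a sketch reusing the same left-edge cutoffs, would
#  put the column boundaries in a lookup table:
#      COLUMNS = [(100, 'Country'), (250, 'Factory name'),
#                 (500, 'Address'), (1000, 'City')]
#      def column_name(left):
#          for cutoff, name in COLUMNS:
#              if left < cutoff: return name
#          return 'Region'  # catch-all, matching the final else branch
#  )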

# REPLACE STEP 8 WITH THE FOLLOWING
# 9. This really crude hack ignores data values that correspond to column headers.
#    A more elegant solution would be to ignore the elements in the first table row on each page.
skiplist=['COUNTRY','FACTORY NAME','ADDRESS','CITY','REGION']
for page in pages[1:]:
    for el in page:
        if el.tag == "text" and el.text not in skiplist:
            if int(el.attrib['left']) < 100: data = { 'Country': el.text }
            elif int(el.attrib['left']) < 250: data['Factory name'] = el.text
            elif int(el.attrib['left']) < 500: data['Address'] = el.text
            elif int(el.attrib['left']) < 1000: data['City'] = el.text
            else:
                data['Region'] = el.text
                print data

# REPLACE STEP 9 WITH THE FOLLOWING
# 10. A crude way of adding data to the database: write each row as we scrape it.
skiplist=['COUNTRY','FACTORY NAME','ADDRESS','CITY','REGION']
for page in pages[1:]:
    for el in page:
        if el.tag == "text" and el.text not in skiplist:
            if int(el.attrib['left']) < 100: data = { 'Country': el.text }
            elif int(el.attrib['left']) < 250: data['Factory name'] = el.text
            elif int(el.attrib['left']) < 500: data['Address'] = el.text
            elif int(el.attrib['left']) < 1000: data['City'] = el.text
            else:
                data['Region'] = el.text
                print data
                scraperwiki.sqlite.save(unique_keys=[], table_name='fabvarn', data=data)
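
# (Note: unique_keys=[] means every run of the scraper appends fresh rows.
#  If some combination of columns uniquely identifies a row - 'Country' plus
#  'Factory name' is only a guess for this data - passing it as unique_keys
#  would make re-runs update rows in place instead of duplicating them:
#      scraperwiki.sqlite.save(unique_keys=['Country', 'Factory name'],
#                              table_name='fabvarn', data=data)
#  )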

# REPLACE STEP 10 WITH THE FOLLOWING
# 11. A more efficient way of writing to the database is to batch up the records scraped from each page and save them a page at a time.
skiplist=['COUNTRY','FACTORY NAME','ADDRESS','CITY','REGION']
bigdata=[]
for page in pages[1:]:
    for el in page:
        if el.tag == "text" and el.text not in skiplist:
            if int(el.attrib['left']) < 100: data = { 'Country': el.text }
            elif int(el.attrib['left']) < 250: data['Factory name'] = el.text
            elif int(el.attrib['left']) < 500: data['Address'] = el.text
            elif int(el.attrib['left']) < 1000: data['City'] = el.text
            else:
                data['Region'] = el.text
                print data
                bigdata.append( data.copy() )
    scraperwiki.sqlite.save(unique_keys=[], table_name='fabvarn', data=bigdata)
    bigdata=[]

'''
# REPLACE STEP 11 WITH THE FOLLOWING
# 12. If necessary, and because we are appending rows rather than saving
#     against unique keys, we may need to clear the database table before we
#     write to it. A utility function can help us do that.

def dropper(table):
    # Drop the named table if it exists, so each run starts from a clean slate
    if table != '':
        try: scraperwiki.sqlite.execute('drop table "'+table+'"')
        except: pass  # the table may not exist yet (e.g. on the first run)

dropper('fabvarn')

skiplist=['COUNTRY','FACTORY NAME','ADDRESS','CITY','REGION']
bigdata=[]
for page in pages[1:]:
    for el in page:
        if el.tag == "text" and el.text not in skiplist:
            if int(el.attrib['left']) < 100: data = { 'Country': el.text }
            elif int(el.attrib['left']) < 250: data['Factory name'] = el.text
            elif int(el.attrib['left']) < 500: data['Address'] = el.text
            elif int(el.attrib['left']) < 1000: data['City'] = el.text
            else:
                data['Region'] = el.text
                print data
                bigdata.append( data.copy() )
    scraperwiki.sqlite.save(unique_keys=[], table_name='fabvarn', data=bigdata)
    bigdata=[]
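
# As a quick sanity check on what was written, ScraperWiki's sqlite wrapper
# can query the table back; select() takes the part of the SQL statement
# after SELECT. A sketch, assuming the table name used above:
#     print scraperwiki.sqlite.select('count(*) AS n FROM fabvarn')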
