adding pdfextract for column extraction
This commit is contained in:
parent
770aeb0d2b
commit
fecd14db59
1 changed files with 69 additions and 0 deletions
69
pyaccuwage/pdfextract.py
Normal file
69
pyaccuwage/pdfextract.py
Normal file
|
@ -0,0 +1,69 @@
|
|||
#!/usr/bin/python
|
||||
# coding=UTF-8
|
||||
|
||||
import subprocess
|
||||
import re
|
||||
import pdb
|
||||
|
||||
""" pdftotext -layout -nopgbrk p1220.pdf - """
|
||||
|
||||
class PDFRecordFinder(object):
    """Locate record tables in the text rendering of a PDF.

    The PDF is converted to text by running the external ``pdftotext``
    program; the resulting lines are then scanned for record headings and
    split into whitespace-separated columns.
    """

    # Two or more consecutive whitespace characters are treated as a column
    # separator. Compiled once here instead of on every extract_record call.
    _MULTI_WHITESPACE = re.compile(r'\s{2,}')

    def __init__(self, src, heading_exp=None):
        """Run ``pdftotext`` on *src* and keep its output split into lines.

        Args:
            src: Path of the PDF file handed to ``pdftotext``.
            heading_exp: Compiled regular expression whose ``match`` marks
                the heading row of a record. When falsy, a pattern matching
                leading whitespace followed by ``Record Name: ...`` is used.

        Raises:
            OSError: If the ``pdftotext`` executable cannot be started.
            subprocess.CalledProcessError: If ``pdftotext`` exits non-zero.
        """
        if not heading_exp:
            # Raw string: '\s' in a plain literal is an invalid escape.
            heading_exp = re.compile(r'\s+Record Name: (.*)')

        opts = ["pdftotext", "-layout", "-nopgbrk", src, '-']
        pdftext = subprocess.check_output(opts)
        # check_output returns bytes; decode once at the I/O boundary so the
        # rest of the class works on text (bytes.split('\n') raises a
        # TypeError on Python 3).
        if isinstance(pdftext, bytes):
            pdftext = pdftext.decode('UTF-8')

        self.textrows = pdftext.split('\n')
        self.heading_exp = heading_exp

    @property
    def rows(self):
        """Return a fresh single-pass iterator over the extracted text lines."""
        return iter(self.textrows)

    @property
    def records(self):
        """Yield the parsed table that follows each matched heading row.

        The same row iterator is shared with ``extract_record`` so that
        parsing resumes on the line right after the heading.

        NOTE(review): ``extract_record`` currently reads until the iterator
        is exhausted, so at most one record is yielded and it contains every
        table-like row after the first heading.
        """
        row_iter = self.rows
        for line in row_iter:
            if self.heading_exp.match(line):
                yield self.extract_record(row_iter)

    def extract_record(self, row_iter):
        """Split the remaining rows of *row_iter* into columns.

        Empty rows are skipped. Rows that contain no run of two or more
        whitespace characters are treated as full-width text rather than
        table rows and are skipped as well.

        Args:
            row_iter: Iterator of rows; each row may be text or UTF-8
                encoded bytes.

        Returns:
            A list with one entry per table row. Each entry is the list of
            tuples produced by ``_split_columns`` for that row.

        NOTE(review): an early exit after a number of full-width text rows
        was sketched in an earlier revision but never enabled, so this
        method always consumes *row_iter* completely.
        """
        result = []
        for raw in row_iter:
            # Accept both bytes (legacy callers) and already-decoded text.
            row = raw.decode('UTF-8') if isinstance(raw, bytes) else raw
            if not row:
                continue
            # A row without multiple whitespace is probably not a table row.
            if not self._MULTI_WHITESPACE.search(row):
                continue
            result.append(self._split_columns(row))
        return result

    @classmethod
    def _split_columns(cls, row):
        """Split one row into ``(offset, text)`` column tuples.

        Every column except the last is a 2-tuple ``(offset, text)``. The
        last tuple carries a third element ``'b'``.

        NOTE(review): when the row starts with a whitespace run, that run is
        skipped and the whole remainder is returned as a single
        ``(offset, text, 'b')`` tuple without further splitting. The ``'b'``
        marker looks like a debugging aid. Both are preserved because
        callers may rely on the shape; confirm the intent before changing.
        """
        columns = []
        pos = 0
        while pos < len(row):
            match = cls._MULTI_WHITESPACE.search(row, pos)
            if match and match.start() > pos:
                columns.append((pos, row[pos:match.start()]))
                pos = match.end()
            else:
                if match:
                    # Separator sits right at pos: step over it.
                    pos = match.end()
                columns.append((pos, row[pos:], 'b'))
                break
        return columns
|
Loading…
Add table
Add a link
Reference in a new issue