adding pdfextract for column extraction

This commit is contained in:
Binh 2012-06-19 15:37:17 -05:00
parent 770aeb0d2b
commit fecd14db59

69
pyaccuwage/pdfextract.py Normal file
View file

@ -0,0 +1,69 @@
#!/usr/bin/python
# coding=UTF-8
import subprocess
import re
import pdb
""" pdftotext -layout -nopgbrk p1220.pdf - """
class PDFRecordFinder(object):
    """Find records in a PDF's text and split their table rows into columns.

    The PDF is converted to text by running ``pdftotext -layout -nopgbrk``
    with the output sent to stdout.  A record starts at every line matched
    by ``heading_exp`` and runs until the next such line (or the end of the
    text).
    """

    # A run of two or more whitespace characters is treated as a column
    # separator; a line without one is assumed to be ordinary prose.
    _MULTI_WHITESPACE = re.compile(r'\s{2,}')

    # One table cell: non-whitespace chunks joined by *single* whitespace
    # characters, so "Tax Year" stays together but "a  b" is two cells.
    _CELL = re.compile(r'\S+(?:\s\S+)*')

    def __init__(self, src, heading_exp=None):
        """Convert ``src`` to text and remember how to spot record headings.

        Args:
            src: Path of the PDF file handed to ``pdftotext``.
            heading_exp: Compiled regular expression whose ``match`` marks
                the first line of a record.  Defaults to a pattern for
                lines of the form ``<whitespace>Record Name: ...``.

        Raises:
            OSError: If the ``pdftotext`` executable cannot be started.
            subprocess.CalledProcessError: If ``pdftotext`` exits non-zero.
            UnicodeDecodeError: If the tool's output is not valid UTF-8.
        """
        if not heading_exp:
            heading_exp = re.compile(r'\s+Record Name: (.*)')
        opts = ["pdftotext", "-layout", "-nopgbrk", src, '-']
        pdftext = subprocess.check_output(opts)
        # Decode once at the I/O boundary so every later regex and string
        # operation works on text rather than on raw bytes.
        self.textrows = pdftext.decode('UTF-8').split('\n')
        self.heading_exp = heading_exp

    @staticmethod
    def _to_text(row):
        """Return ``row`` as text, decoding it as UTF-8 if it is bytes."""
        if isinstance(row, bytes):
            return row.decode('UTF-8')
        return row

    @property
    def rows(self):
        """Yield the lines of the extracted text in document order."""
        for row in self.textrows:
            yield row

    @property
    def records(self):
        """Yield one parsed record per heading line.

        Lines before the first heading are ignored.  The lines between one
        heading and the next (or the end of the text) are handed to
        ``extract_record``; the heading line itself is not included.

        Yields:
            The list returned by ``extract_record`` for each record.
        """
        # ``pending`` holds the rows of the record being collected; it stays
        # ``None`` until the first heading has been seen.
        pending = None
        for raw in self.rows:
            row = self._to_text(raw)
            if self.heading_exp.match(row):
                if pending is not None:
                    yield self.extract_record(iter(pending))
                pending = []
            elif pending is not None:
                pending.append(row)
        if pending is not None:
            yield self.extract_record(iter(pending))

    def extract_record(self, row_iter):
        """Split every table-like row from ``row_iter`` into its columns.

        Rows that are empty or contain no run of two or more whitespace
        characters are skipped.  ``row_iter`` is consumed completely.

        Args:
            row_iter: Iterable of text lines (UTF-8 bytes are also accepted).

        Returns:
            A list with one entry per table row.  Each entry is a list of
            ``(column_offset, cell_text)`` tuples, where ``column_offset``
            is the index of the cell's first character within the row and
            ``cell_text`` has no surrounding whitespace.
        """
        result = []
        for raw in row_iter:
            row = self._to_text(raw)
            # If the line contains multiple whitespace it is probably a
            # table row; anything else is full-width text and is ignored.
            if not self._MULTI_WHITESPACE.search(row):
                continue
            cells = [
                (found.start(), found.group())
                for found in self._CELL.finditer(row)
            ]
            # A whitespace-only line passes the check above but has no cells.
            if cells:
                result.append(cells)
        return result