Getting closer: added a FIXME to one of the fields. Still having issues with columns in description fields.
This commit is contained in:
parent
5dde3be536
commit
40fcbdc8b8
3 changed files with 36 additions and 7 deletions
|
@ -177,8 +177,13 @@ class RecordBuilder(object):
|
|||
for entry in entries:
|
||||
|
||||
(f_range, f_name, f_length, f_desc) = list(entry) + ['']*(4-len(entry))
|
||||
|
||||
try:
|
||||
f_length = int(f_length)
|
||||
except ValueError, e:
|
||||
# bad result, skip
|
||||
continue
|
||||
|
||||
f_length = int(f_length)
|
||||
try:
|
||||
assert f_length == RangeToken(f_range).value
|
||||
except AssertionError:
|
||||
|
|
|
@ -13,7 +13,7 @@ class PDFRecordFinder(object):
|
|||
if not heading_exp:
|
||||
heading_exp = re.compile('\s+Record Name: (.*)')
|
||||
|
||||
opts = ["pdftotext", "-layout", "-nopgbrk", src, '-']
|
||||
opts = ["pdftotext", "-layout", "-nopgbrk", "-eol", "unix", src, '-']
|
||||
pdftext = subprocess.check_output(opts)
|
||||
self.textrows = pdftext.split('\n')
|
||||
self.heading_exp = heading_exp
|
||||
|
@ -31,9 +31,18 @@ class PDFRecordFinder(object):
|
|||
for (i, row) in enumerate(self.textrows):
|
||||
match = self.heading_exp.match(row)
|
||||
if match:
|
||||
if not ''.join(match.groups()).lower().endswith('(continued)'):
|
||||
results.append((i, ''.join(match.groups())))
|
||||
print i,match.groups()
|
||||
#if not ''.join(match.groups()).lower().endswith('(continued)'):
|
||||
results.append((i, ''.join(match.groups())))
|
||||
|
||||
"""
|
||||
results2 = []
|
||||
for r in results:
|
||||
if len(results2)==0 or results2[-1:][0][1] != r[1]:
|
||||
results2.append(r)
|
||||
results = results2
|
||||
"""
|
||||
|
||||
merged = []
|
||||
for (a, b) in zip(results, results[1:] + [(len(self.textrows),None)]):
|
||||
merged.append( (a[0], b[0]-1, a[1]) )
|
||||
|
@ -50,6 +59,9 @@ class PDFRecordFinder(object):
|
|||
continue
|
||||
|
||||
if cc.is_next_field(row):
|
||||
if row[1][1] == 'Vendor Indicator':
|
||||
import pdb
|
||||
pdb.set_trace()
|
||||
yield cc
|
||||
cc = ColumnCollector()
|
||||
|
||||
|
@ -123,6 +135,13 @@ class ColumnCollector(object):
|
|||
|
||||
else:
|
||||
# try adding a wiggle room value?
|
||||
# FIXME:
|
||||
# Sometimes description columns contain column-like
|
||||
# layouts, and this causes the ColumnCollector to become
|
||||
# confused. Perhaps we could check to see if a column occurs
|
||||
# after the maximum column, and assume it's part of the
|
||||
# max column?
|
||||
|
||||
raise UnknownColumn
|
||||
|
||||
def is_next_field(self, data):
|
||||
|
|
|
@ -4,6 +4,7 @@ from pyaccuwage.pdfextract import PDFRecordFinder
|
|||
import argparse
|
||||
import sys
|
||||
import os
|
||||
import re
|
||||
|
||||
parser = argparse.ArgumentParser(description="Parse and convert contents of IRS files into pyaccuwage e-file classes.")
|
||||
parser.add_argument("-i", "--input", nargs=1, required=True, metavar="file", type=argparse.FileType('r'), help="Source PDF file, ie: p1220.pdf")
|
||||
|
@ -30,8 +31,12 @@ source_file = os.path.abspath(args.input[0].name)
|
|||
doc = PDFRecordFinder(source_file)
|
||||
records = doc.records()
|
||||
builder = RecordBuilder()
|
||||
|
||||
for rec in records:
|
||||
print 'name', rec[0]
|
||||
for field in builder.load(map(lambda x:x.tuple, rec[1][1:])):
|
||||
print field
|
||||
|
||||
sys.stdout.write("class %s(object):\n" % re.sub('[^\w]','',rec[0]))
|
||||
|
||||
for field in builder.load(map(lambda x:x.tuple, rec[1][1:])):
|
||||
sys.stdout.write('\t' + field + '\n')
|
||||
#print field
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue