Getting closer; added a FIXME to one of the fields. Having issues with columns in description fields.

This commit is contained in:
Binh 2012-07-17 15:44:28 -05:00
parent 5dde3be536
commit 40fcbdc8b8
3 changed files with 36 additions and 7 deletions

View file

@ -178,7 +178,12 @@ class RecordBuilder(object):
(f_range, f_name, f_length, f_desc) = list(entry) + ['']*(4-len(entry)) (f_range, f_name, f_length, f_desc) = list(entry) + ['']*(4-len(entry))
f_length = int(f_length) try:
f_length = int(f_length)
except ValueError, e:
# bad result, skip
continue
try: try:
assert f_length == RangeToken(f_range).value assert f_length == RangeToken(f_range).value
except AssertionError: except AssertionError:

View file

@ -13,7 +13,7 @@ class PDFRecordFinder(object):
if not heading_exp: if not heading_exp:
heading_exp = re.compile('\s+Record Name: (.*)') heading_exp = re.compile('\s+Record Name: (.*)')
opts = ["pdftotext", "-layout", "-nopgbrk", src, '-'] opts = ["pdftotext", "-layout", "-nopgbrk", "-eol", "unix", src, '-']
pdftext = subprocess.check_output(opts) pdftext = subprocess.check_output(opts)
self.textrows = pdftext.split('\n') self.textrows = pdftext.split('\n')
self.heading_exp = heading_exp self.heading_exp = heading_exp
@ -31,8 +31,17 @@ class PDFRecordFinder(object):
for (i, row) in enumerate(self.textrows): for (i, row) in enumerate(self.textrows):
match = self.heading_exp.match(row) match = self.heading_exp.match(row)
if match: if match:
if not ''.join(match.groups()).lower().endswith('(continued)'): print i,match.groups()
results.append((i, ''.join(match.groups()))) #if not ''.join(match.groups()).lower().endswith('(continued)'):
results.append((i, ''.join(match.groups())))
"""
results2 = []
for r in results:
if len(results2)==0 or results2[-1:][0][1] != r[1]:
results2.append(r)
results = results2
"""
merged = [] merged = []
for (a, b) in zip(results, results[1:] + [(len(self.textrows),None)]): for (a, b) in zip(results, results[1:] + [(len(self.textrows),None)]):
@ -50,6 +59,9 @@ class PDFRecordFinder(object):
continue continue
if cc.is_next_field(row): if cc.is_next_field(row):
if row[1][1] == 'Vendor Indicator':
import pdb
pdb.set_trace()
yield cc yield cc
cc = ColumnCollector() cc = ColumnCollector()
@ -123,6 +135,13 @@ class ColumnCollector(object):
else: else:
# try adding a wiggle room value? # try adding a wiggle room value?
# FIXME:
# Sometimes description columns contain column-like
# layouts, and this causes the ColumnCollector to become
# confused. Perhaps we could check to see if a column occurs
# after the maximum column, and assume it's part of the
# max column?
raise UnknownColumn raise UnknownColumn
def is_next_field(self, data): def is_next_field(self, data):

View file

@ -4,6 +4,7 @@ from pyaccuwage.pdfextract import PDFRecordFinder
import argparse import argparse
import sys import sys
import os import os
import re
parser = argparse.ArgumentParser(description="Parse and convert contents of IRS files into pyaccuwage e-file classes.") parser = argparse.ArgumentParser(description="Parse and convert contents of IRS files into pyaccuwage e-file classes.")
parser.add_argument("-i", "--input", nargs=1, required=True, metavar="file", type=argparse.FileType('r'), help="Source PDF file, ie: p1220.pdf") parser.add_argument("-i", "--input", nargs=1, required=True, metavar="file", type=argparse.FileType('r'), help="Source PDF file, ie: p1220.pdf")
@ -30,8 +31,12 @@ source_file = os.path.abspath(args.input[0].name)
doc = PDFRecordFinder(source_file) doc = PDFRecordFinder(source_file)
records = doc.records() records = doc.records()
builder = RecordBuilder() builder = RecordBuilder()
for rec in records:
print 'name', rec[0] for rec in records:
for field in builder.load(map(lambda x:x.tuple, rec[1][1:])):
print field sys.stdout.write("class %s(object):\n" % re.sub('[^\w]','',rec[0]))
for field in builder.load(map(lambda x:x.tuple, rec[1][1:])):
sys.stdout.write('\t' + field + '\n')
#print field