From 40fcbdc8b8fb157268817f8db17b85c1df37d547 Mon Sep 17 00:00:00 2001 From: Binh Nguyen Date: Tue, 17 Jul 2012 15:44:28 -0500 Subject: [PATCH] getting closer, added a FIXME to one of the fields. Having issues with columns in description fields --- pyaccuwage/parser.py | 7 ++++++- pyaccuwage/pdfextract.py | 25 ++++++++++++++++++++++--- scripts/pyaccuwage-pdfparse | 11 ++++++++--- 3 files changed, 36 insertions(+), 7 deletions(-) diff --git a/pyaccuwage/parser.py b/pyaccuwage/parser.py index 7ca6a37..b06f6d4 100644 --- a/pyaccuwage/parser.py +++ b/pyaccuwage/parser.py @@ -177,8 +177,13 @@ class RecordBuilder(object): for entry in entries: (f_range, f_name, f_length, f_desc) = list(entry) + ['']*(4-len(entry)) + + try: + f_length = int(f_length) + except ValueError, e: + # bad result, skip + continue - f_length = int(f_length) try: assert f_length == RangeToken(f_range).value except AssertionError: diff --git a/pyaccuwage/pdfextract.py b/pyaccuwage/pdfextract.py index eeba7c5..9b7c64a 100644 --- a/pyaccuwage/pdfextract.py +++ b/pyaccuwage/pdfextract.py @@ -13,7 +13,7 @@ class PDFRecordFinder(object): if not heading_exp: heading_exp = re.compile('\s+Record Name: (.*)') - opts = ["pdftotext", "-layout", "-nopgbrk", src, '-'] + opts = ["pdftotext", "-layout", "-nopgbrk", "-eol", "unix", src, '-'] pdftext = subprocess.check_output(opts) self.textrows = pdftext.split('\n') self.heading_exp = heading_exp @@ -31,9 +31,18 @@ class PDFRecordFinder(object): for (i, row) in enumerate(self.textrows): match = self.heading_exp.match(row) if match: - if not ''.join(match.groups()).lower().endswith('(continued)'): - results.append((i, ''.join(match.groups()))) + print i,match.groups() + #if not ''.join(match.groups()).lower().endswith('(continued)'): + results.append((i, ''.join(match.groups()))) + """ + results2 = [] + for r in results: + if len(results2)==0 or results2[-1:][0][1] != r[1]: + results2.append(r) + results = results2 + """ + merged = [] for (a, b) in zip(results, results[1:] + [(len(self.textrows),None)]): merged.append( (a[0], b[0]-1, a[1]) ) @@ -50,6 +59,9 @@ class PDFRecordFinder(object): continue if cc.is_next_field(row): + if row[1][1] == 'Vendor Indicator': + import pdb + pdb.set_trace() yield cc cc = ColumnCollector() @@ -123,6 +135,13 @@ class ColumnCollector(object): else: # try adding a wiggle room value? + # FIXME: + # Sometimes description columns contain column-like + # layouts, and this causes the ColumnCollector to become + # confused. Perhaps we could check to see if a column occurs + # after the maximum column, and assume it's part of the + # max column? + raise UnknownColumn def is_next_field(self, data): diff --git a/scripts/pyaccuwage-pdfparse b/scripts/pyaccuwage-pdfparse index 615ddcf..f830f86 100755 --- a/scripts/pyaccuwage-pdfparse +++ b/scripts/pyaccuwage-pdfparse @@ -4,6 +4,7 @@ from pyaccuwage.pdfextract import PDFRecordFinder import argparse import sys import os +import re parser = argparse.ArgumentParser(description="Parse and convert contents of IRS files into pyaccuwage e-file classes.") parser.add_argument("-i", "--input", nargs=1, required=True, metavar="file", type=argparse.FileType('r'), help="Source PDF file, ie: p1220.pdf") @@ -30,8 +31,12 @@ source_file = os.path.abspath(args.input[0].name) doc = PDFRecordFinder(source_file) records = doc.records() builder = RecordBuilder() + for rec in records: - print 'name', rec[0] - for field in builder.load(map(lambda x:x.tuple, rec[1][1:])): - print field + + sys.stdout.write("class %s(object):\n" % re.sub('[^\w]','',rec[0])) + + for field in builder.load(map(lambda x:x.tuple, rec[1][1:])): + sys.stdout.write('\t' + field + '\n') + #print field