Getting closer; added a FIXME to one of the fields. Having issues with column-like layouts inside description fields.

2012-07-17 15:44:28 -05:00 · 2012-07-17 15:44:28 -05:00 · 40fcbdc8b8
commit 40fcbdc8b8
parent 5dde3be536
3 changed files with 36 additions and 7 deletions
--- a/pyaccuwage/parser.py
+++ b/pyaccuwage/parser.py
@ -178,7 +178,12 @@ class RecordBuilder(object):
            (f_range, f_name, f_length, f_desc) = list(entry) + ['']*(4-len(entry))
-            f_length = int(f_length)
+            try:
                f_length = int(f_length)
            except ValueError, e:
                # bad result, skip
                continue
            try:
                assert f_length == RangeToken(f_range).value
            except AssertionError:
--- a/pyaccuwage/pdfextract.py
+++ b/pyaccuwage/pdfextract.py
@ -13,7 +13,7 @@ class PDFRecordFinder(object):
        if not heading_exp:
            heading_exp = re.compile('\s+Record Name: (.*)')
-        opts = ["pdftotext", "-layout", "-nopgbrk", src, '-']
+        opts = ["pdftotext", "-layout", "-nopgbrk", "-eol", "unix", src, '-']
        pdftext = subprocess.check_output(opts)
        self.textrows = pdftext.split('\n')
        self.heading_exp = heading_exp
@ -31,8 +31,17 @@ class PDFRecordFinder(object):
        for (i, row) in enumerate(self.textrows):
            match = self.heading_exp.match(row)
            if match:
-                if not ''.join(match.groups()).lower().endswith('(continued)'):
+                print i,match.groups()
-                    results.append((i, ''.join(match.groups())))
+                #if not ''.join(match.groups()).lower().endswith('(continued)'):
                results.append((i, ''.join(match.groups())))
        """
        results2 = []
        for r in results:
            if len(results2)==0 or results2[-1:][0][1] != r[1]:
                results2.append(r)
        results = results2
        """
        merged = []
        for (a, b) in zip(results, results[1:] + [(len(self.textrows),None)]):
@ -50,6 +59,9 @@ class PDFRecordFinder(object):
                continue
            if cc.is_next_field(row):
                if row[1][1] == 'Vendor Indicator':
                    import pdb
                    pdb.set_trace()
                yield cc
                cc = ColumnCollector()
@ -123,6 +135,13 @@ class ColumnCollector(object):
        else:        
            # try adding a wiggle room value?
            # FIXME:
            # Sometimes description columns contain column-like
            # layouts, and this causes the ColumnCollector to become
            # confused. Perhaps we could check to see if a column occurs
            # after the maximum column, and assume it's part of the
            # max column?
            raise UnknownColumn
    def is_next_field(self, data):
--- a/scripts/pyaccuwage-pdfparse
+++ b/scripts/pyaccuwage-pdfparse
@ -4,6 +4,7 @@ from pyaccuwage.pdfextract import PDFRecordFinder
 import argparse
 import sys
 import os
 import re
 parser = argparse.ArgumentParser(description="Parse and convert contents of IRS files into pyaccuwage e-file classes.")
 parser.add_argument("-i", "--input", nargs=1, required=True,  metavar="file", type=argparse.FileType('r'), help="Source PDF file, ie: p1220.pdf")
@ -30,8 +31,12 @@ source_file = os.path.abspath(args.input[0].name)
 doc = PDFRecordFinder(source_file)
 records = doc.records()
 builder = RecordBuilder()
-for rec in records:
+
-    print 'name', rec[0]
+for rec in records:
-    for field in builder.load(map(lambda x:x.tuple, rec[1][1:])):
+    
-        print field
+    sys.stdout.write("class %s(object):\n" % re.sub('[^\w]','',rec[0]))
    for field in builder.load(map(lambda x:x.tuple, rec[1][1:])):
        sys.stdout.write('\t' + field + '\n')
        #print field