Getting closer; added a FIXME to one of the fields. Still having issues with column-like layouts inside description fields.

2012-07-17 15:44:28 -05:00 · 2012-07-17 15:44:28 -05:00 · 40fcbdc8b8
commit 40fcbdc8b8
parent 5dde3be536
3 changed files with 36 additions and 7 deletions
--- a/pyaccuwage/parser.py
+++ b/pyaccuwage/parser.py
@ -177,8 +177,13 @@ class RecordBuilder(object):
        for entry in entries:
            
            (f_range, f_name, f_length, f_desc) = list(entry) + ['']*(4-len(entry))
+            
+            try:
+                f_length = int(f_length)
+            except ValueError, e:
+                # bad result, skip
+                continue

-            f_length = int(f_length)
            try:
                assert f_length == RangeToken(f_range).value
            except AssertionError:
--- a/pyaccuwage/pdfextract.py
+++ b/pyaccuwage/pdfextract.py
@ -13,7 +13,7 @@ class PDFRecordFinder(object):
        if not heading_exp:
            heading_exp = re.compile('\s+Record Name: (.*)')

-        opts = ["pdftotext", "-layout", "-nopgbrk", src, '-']
+        opts = ["pdftotext", "-layout", "-nopgbrk", "-eol", "unix", src, '-']
        pdftext = subprocess.check_output(opts)
        self.textrows = pdftext.split('\n')
        self.heading_exp = heading_exp
@ -31,9 +31,18 @@ class PDFRecordFinder(object):
        for (i, row) in enumerate(self.textrows):
            match = self.heading_exp.match(row)
            if match:
-                if not ''.join(match.groups()).lower().endswith('(continued)'):
-                    results.append((i, ''.join(match.groups())))
+                print i,match.groups()
+                #if not ''.join(match.groups()).lower().endswith('(continued)'):
+                results.append((i, ''.join(match.groups())))
        
+        """
+        results2 = []
+        for r in results:
+            if len(results2)==0 or results2[-1:][0][1] != r[1]:
+                results2.append(r)
+        results = results2
+        """
+
        merged = []
        for (a, b) in zip(results, results[1:] + [(len(self.textrows),None)]):
            merged.append( (a[0], b[0]-1, a[1]) )
@ -50,6 +59,9 @@ class PDFRecordFinder(object):
                continue
            
            if cc.is_next_field(row):
+                if row[1][1] == 'Vendor Indicator':
+                    import pdb
+                    pdb.set_trace()
                yield cc
                cc = ColumnCollector()
            
@ -123,6 +135,13 @@ class ColumnCollector(object):
            
        else:        
            # try adding a wiggle room value?
+            # FIXME:
+            # Sometimes description columns contain column-like
+            # layouts, and this causes the ColumnCollector to become
+            # confused. Perhaps we could check to see if a column occurs
+            # after the maximum column, and assume it's part of the
+            # max column?
+
            raise UnknownColumn
    
    def is_next_field(self, data):
--- a/scripts/pyaccuwage-pdfparse
+++ b/scripts/pyaccuwage-pdfparse
@ -4,6 +4,7 @@ from pyaccuwage.pdfextract import PDFRecordFinder
 import argparse
 import sys
 import os
+import re

 parser = argparse.ArgumentParser(description="Parse and convert contents of IRS files into pyaccuwage e-file classes.")
 parser.add_argument("-i", "--input", nargs=1, required=True,  metavar="file", type=argparse.FileType('r'), help="Source PDF file, ie: p1220.pdf")
@ -30,8 +31,12 @@ source_file = os.path.abspath(args.input[0].name)
 doc = PDFRecordFinder(source_file)
 records = doc.records()
 builder = RecordBuilder()
+
 for rec in records:
-    print 'name', rec[0]
-    for field in builder.load(map(lambda x:x.tuple, rec[1][1:])):
-        print field
+    
+    sys.stdout.write("class %s(object):\n" % re.sub('[^\w]','',rec[0]))
+    
+    for field in builder.load(map(lambda x:x.tuple, rec[1][1:])):
+        sys.stdout.write('\t' + field + '\n')
+        #print field