Update for 2023 P1220 parsing, stupid IRS

2024-03-29 10:46:01 -04:00 · 2024-03-29 10:46:01 -04:00 · 66573e4d1d
commit 66573e4d1d
parent 86f8861da1
3 changed files with 97 additions and 346 deletions
--- a/scripts/pyaccuwage-pdfparse
+++ b/scripts/pyaccuwage-pdfparse
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 from pyaccuwage.parser import RecordBuilder
 from pyaccuwage.pdfextract import PDFRecordFinder
 import argparse
@@ -29,48 +29,9 @@ doc = PDFRecordFinder(source_file)
 records = doc.records()
 builder = RecordBuilder()

-def record_begins_at(field):
-    return int(fields[0].data.values()[0].split('-')[0], 10)
-
-def record_ends_at(fields):
-    return int(fields[-1].data.values()[0].split('-')[-1], 10)
-
-last_record_begins_at = -1
-last_record_ends_at = -1
-
-for rec in records:
-    #if not rec[1]:
-    #    continue # no actual fields detected
-    fields = rec[1]
-
-    # strip out fields that are not 4 items long
-    fields = filter(lambda x:len(x.tuple) == 4, fields)
-
-    # strip fields that don't begin at position 0
-    fields = filter(lambda x: 0 in x.data, fields)
-
-    # strip fields that don't have a length-range type item in position 0
-    fields = filter(lambda x: re.match('^\d+[-]?\d*$', x.data[0]), fields)
-
-    if not fields:
-        continue
-
-    begins_at = record_begins_at(fields)
-    ends_at = record_ends_at(fields)
-
-    # FIXME record_ends_at is randomly exploding due to record data being
-    # a lump of text and not necessarily a field entry. I assume
-    # this is cleaned out by the record builder class.
-
-    #print last_record_ends_at + 1, begins_at
-    if last_record_ends_at + 1 != begins_at:
-        name = re.sub('^[^a-zA-Z]*','',rec[0].split(':')[-1])
-        name = re.sub('[^\w]*', '', name)
-        sys.stdout.write("\nclass %s(pyaccuwagemodel.Model):\n" % name)
-
-    for field in builder.load(map(lambda x:x.tuple, rec[1][0:])):
+for (name, fields) in records:
+    name = re.sub(r'^[^a-zA-Z]*','', name.split(':')[-1])
+    name = re.sub(r'[^\w]*', '', name)
+    sys.stdout.write("\nclass %s(pyaccuwagemodel.Model):\n" % name)
+    for field in builder.load(map(lambda x: x, fields[0:])):
        sys.stdout.write('\t' + field + '\n')
-        #print field
-
-    last_record_ends_at = ends_at
-