update for 2023 p1220 parsing, stupid irs
This commit is contained in:
parent
86f8861da1
commit
66573e4d1d
3 changed files with 97 additions and 346 deletions
|
@ -1,4 +1,4 @@
|
|||
#!/usr/bin/python
|
||||
#!/usr/bin/env python
|
||||
from pyaccuwage.parser import RecordBuilder
|
||||
from pyaccuwage.pdfextract import PDFRecordFinder
|
||||
import argparse
|
||||
|
@ -29,48 +29,9 @@ doc = PDFRecordFinder(source_file)
|
|||
records = doc.records()
|
||||
builder = RecordBuilder()
|
||||
|
||||
def record_begins_at(field):
    """Return the starting position of the first field entry of a record.

    Args:
        field: Non-empty sequence of field entries for one record. The
            singular parameter name is kept for backward compatibility.
            Each entry must expose a ``data`` mapping; the first value of
            that mapping is presumably a position string such as
            ``"12-20"`` or ``"12"`` (verify against the PDF extractor).

    Returns:
        The integer before the first ``-`` of that string, parsed base 10.

    Raises:
        IndexError: If the sequence is empty.
        ValueError: If the text before the first ``-`` is not a decimal
            integer.
    """
    # Use the argument rather than a module-level ``fields`` global, and
    # avoid indexing ``dict.values()`` directly: dict views are not
    # subscriptable on Python 3.
    first_value = next(iter(field[0].data.values()))
    return int(first_value.split('-')[0], 10)
|
||||
|
||||
def record_ends_at(fields):
    """Return the ending position of the last field entry of a record.

    Args:
        fields: Non-empty sequence of field entries for one record. Each
            entry must expose a ``data`` mapping; the first value of that
            mapping is presumably a position string such as ``"12-20"``
            or ``"12"`` (verify against the PDF extractor).

    Returns:
        The integer after the last ``-`` of that string (or the whole
        string when it contains no ``-``), parsed base 10.

    Raises:
        IndexError: If the sequence is empty.
        ValueError: If the text after the last ``-`` is not a decimal
            integer, e.g. when the entry holds free text instead of a
            position range.
    """
    # ``dict.values()`` is a non-subscriptable view on Python 3, so take the
    # first value through an iterator instead of ``[0]``.
    last_value = next(iter(fields[-1].data.values()))
    return int(last_value.split('-')[-1], 10)
|
||||
|
||||
last_record_begins_at = -1
|
||||
last_record_ends_at = -1
|
||||
|
||||
for rec in records:
|
||||
#if not rec[1]:
|
||||
# continue # no actual fields detected
|
||||
fields = rec[1]
|
||||
|
||||
# strip out fields that are not 4 items long
|
||||
fields = filter(lambda x:len(x.tuple) == 4, fields)
|
||||
|
||||
# strip fields that don't begin at position 0
|
||||
fields = filter(lambda x: 0 in x.data, fields)
|
||||
|
||||
# strip fields that don't have a length-range type item in position 0
|
||||
fields = filter(lambda x: re.match('^\d+[-]?\d*$', x.data[0]), fields)
|
||||
|
||||
if not fields:
|
||||
continue
|
||||
|
||||
begins_at = record_begins_at(fields)
|
||||
ends_at = record_ends_at(fields)
|
||||
|
||||
# FIXME record_ends_at is randomly exploding due to record data being
|
||||
# a lump of text and not necessarily a field entry. I assume
|
||||
# this is cleaned out by the record builder class.
|
||||
|
||||
#print last_record_ends_at + 1, begins_at
|
||||
if last_record_ends_at + 1 != begins_at:
|
||||
name = re.sub('^[^a-zA-Z]*','',rec[0].split(':')[-1])
|
||||
name = re.sub('[^\w]*', '', name)
|
||||
sys.stdout.write("\nclass %s(pyaccuwagemodel.Model):\n" % name)
|
||||
|
||||
for field in builder.load(map(lambda x:x.tuple, rec[1][0:])):
|
||||
for (name, fields) in records:
|
||||
name = re.sub(r'^[^a-zA-Z]*','', name.split(':')[-1])
|
||||
name = re.sub(r'[^\w]*', '', name)
|
||||
sys.stdout.write("\nclass %s(pyaccuwagemodel.Model):\n" % name)
|
||||
for field in builder.load(map(lambda x: x, fields[0:])):
|
||||
sys.stdout.write('\t' + field + '\n')
|
||||
#print field
|
||||
|
||||
last_record_ends_at = ends_at
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue