are overlapping. I'm assuming this is due to a missing `continue` (or something similar) inside the ColumnCollector. I added a couple of new IsNextRecord exceptions in response to blank rows, but this may be causing more problems than expected. The next step is probably to check the records returned and verify that nothing is being duplicated. Some of the duplicates may be filtered out by the RecordBuilder class, or during the field filtering in the pyaccuwage-pdfparse script (see: fields).
77 lines
2.3 KiB
Python
Executable file
77 lines
2.3 KiB
Python
Executable file
#!/usr/bin/python
|
|
from pyaccuwage.parser import RecordBuilder
|
|
from pyaccuwage.pdfextract import PDFRecordFinder
|
|
import argparse
|
|
import sys
|
|
import os
|
|
import re
|
|
|
|
# Command-line interface: one required input file and an optional flag that
# asks for the import preamble to be emitted ahead of the generated classes.
parser = argparse.ArgumentParser(
    description="Parse and convert contents of IRS files into pyaccuwage e-file classes.",
)
parser.add_argument(
    "-i", "--input",
    help="Source PDF file, ie: p1220.pdf",
    metavar="file",
    nargs=1,  # stored as a one-element list, hence args.input[0] later on
    required=True,
    type=argparse.FileType('r'),
)
parser.add_argument(
    "-f", "--full",
    action="store_true",
    help="Generate full python file, including related imports.",
)
|
|
|
|
# Parse sys.argv at import time; since -i/--input is declared required,
# argparse reports a usage error and exits when it is missing.
args = parser.parse_args()
|
|
|
|
def generate_imports():
    """Return the import preamble for a generated module.

    The text consists of the two pyaccuwage import statements, one per line,
    followed by a single blank line.
    """
    preamble_lines = (
        "from pyaccuwage import model",
        "from pyaccuwage.fields import *",
        "",
        "",
    )
    return "\n".join(preamble_lines)
|
|
|
|
def generate_class_begin(name):
    """Return the opening ``class`` line for a generated record class.

    The base class is spelled ``model.Model`` so that it resolves against the
    ``from pyaccuwage import model`` line produced by ``generate_imports``
    (the previous ``mode.Model`` spelling referred to a name that the
    generated module never imports).

    :param name: identifier to use as the class name.
    :returns: the class header line, terminated by a newline.
    """
    return "class %s(model.Model):\n" % name
|
|
|
|
# When --full was given, start the output with the import preamble so the
# generated text forms a complete module.
if args.full:
    sys.stdout.write(generate_imports())

# argparse already opened the input (FileType); only its name is used here,
# turned into an absolute path for the PDF reader.
source_file = os.path.abspath(args.input[0].name)

# Record extraction from the PDF, and the builder that the main loop uses to
# turn each record's field tuples into output lines.
doc = PDFRecordFinder(source_file)
records = doc.records()
builder = RecordBuilder()
|
|
|
|
def record_begins_at(field):
    """Return the starting position of the first field of a record.

    Despite its singular name, ``field`` is indexed as a sequence of field
    entries (the caller passes the record's filtered field list).  The body
    now reads this argument; it previously ignored it and read the
    module-level ``fields`` variable instead, which only worked when that
    global happened to hold the same list.

    The first value of the first entry's ``data`` is expected to be a
    position such as ``"12"`` or a range such as ``"12-20"``; the part before
    the dash is parsed as a base-10 integer.

    :param field: non-empty sequence of field entries exposing ``.data``.
    :returns: the starting position as an ``int``.
    :raises ValueError: if the position text is not a decimal number.
    """
    # NOTE(review): relies on .values() being indexable and on its first item
    # being the position column -- data[0] may be what is intended; confirm.
    first_range = field[0].data.values()[0]
    return int(first_range.split('-')[0], 10)
|
|
|
|
def record_ends_at(fields):
    """Return the ending position of the last field of a record.

    Takes the first value of the last entry's ``data`` (a position such as
    ``"7"`` or a range such as ``"5-10"``) and parses the part after the last
    dash as a base-10 integer.

    :param fields: non-empty sequence of field entries exposing ``.data``.
    :returns: the ending position as an ``int``.
    """
    last_entry = fields[-1]
    position_range = last_entry.data.values()[0]
    upper_bound = position_range.split('-')[-1]
    return int(upper_bound, 10)
|
|
|
|
# Position bookkeeping carried from one record to the next.
# last_record_begins_at is initialised here but is not updated by the loop.
last_record_begins_at = -1
last_record_ends_at = -1

for rec in records:
    # rec[0] is used as the record's name, rec[1] as its field entries.
    # (Disabled check: skip records with no detected fields.)
    #if not rec[1]:
    #    continue # no actual fields detected
    fields = rec[1]

    # Keep only entries that look like real field rows, checked in this order:
    #   * the entry's tuple is 4 items long,
    #   * the entry has something at position 0,
    #   * position 0 holds a length/range item such as "12" or "12-20".
    # Built as a list so the emptiness test below is reliable.
    fields = [
        entry for entry in fields
        if len(entry.tuple) == 4
        and 0 in entry.data
        and re.match(r'^\d+[-]?\d*$', entry.data[0])
    ]

    if not fields:
        continue

    begins_at = record_begins_at(fields)
    ends_at = record_ends_at(fields)

    # FIXME record_ends_at is randomly exploding due to record data being
    # a lump of text and not necessarily a field entry. I assume
    # this is cleaned out by the record builder class.

    # Overlap diagnostics go to stderr: stdout carries the generated Python
    # source and must not be interleaved with debug numbers.
    sys.stderr.write("%s %s\n" % (last_record_ends_at + 1, begins_at))

    #if last_record_ends_at + 1 != begins_at:
    sys.stdout.write("\nclass %s(object):\n" % re.sub(r'[^\w]', '', rec[0]))

    # NOTE(review): the builder is fed rec[1][1:] (the unfiltered entries
    # minus the first), not the filtered list above -- confirm this is
    # intentional.
    for field in builder.load([entry.tuple for entry in rec[1][1:]]):
        sys.stdout.write('\t' + field + '\n')
        #print field

    last_record_ends_at = ends_at
|
|
|