76 lines
2.3 KiB
Python
Executable file
76 lines
2.3 KiB
Python
Executable file
#!/usr/bin/python
|
|
from pyaccuwage.parser import RecordBuilder
|
|
from pyaccuwage.pdfextract import PDFRecordFinder
|
|
import argparse
|
|
import sys
|
|
import os
|
|
import re
|
|
|
|
# Command-line interface. Both options are consumed further down the script:
# --input supplies the PDF whose record layouts are converted, and --full
# additionally emits the import header ahead of the generated classes.
parser = argparse.ArgumentParser(
    description="Parse and convert contents of IRS files into pyaccuwage e-file classes.",
)
parser.add_argument(
    "-i", "--input",
    type=argparse.FileType('r'),
    nargs=1,  # stored as a one-element list, hence args.input[0] later on
    required=True,
    metavar="file",
    help="Source PDF file, ie: p1220.pdf",
)
parser.add_argument(
    "-f", "--full",
    action="store_true",
    help="Generate full python file, including related imports.",
)

# Parsing happens at import time; this file is meant to be run as a script.
args = parser.parse_args()
|
|
|
|
def generate_imports():
    """Return the import header written at the top of a full generated module.

    The returned text holds the two import statements the generated model
    classes rely on, each terminated by a newline, followed by one extra
    blank line.
    """
    header_lines = (
        "from pyaccuwage import model as pyaccuwagemodel",
        "from pyaccuwage.fields import *",
    )
    # One newline per statement, then one more to leave a blank separator line.
    return "".join(line + "\n" for line in header_lines) + "\n"
|
|
|
|
# Emit the import header first when a complete module was requested.
if args.full:
    sys.stdout.write(generate_imports())

# argparse.FileType has already opened the input, but only its path is used
# here: PDFRecordFinder is handed the path, not the handle. Close the handle
# as soon as the path is known so it is not leaked for the rest of the run.
input_handle = args.input[0]
source_file = os.path.abspath(input_handle.name)
if input_handle is not sys.stdin:  # "-" maps to stdin; leave that one open
    input_handle.close()

doc = PDFRecordFinder(source_file)
records = doc.records()
builder = RecordBuilder()
|
|
|
|
def record_begins_at(field):
    """Return the starting position of a record as an integer.

    Args:
        field: Non-empty sequence of field entries (despite the singular
            name, which is kept for compatibility). Each entry exposes a
            ``data`` mapping; the first value of the first entry's mapping
            is read as a position string such as ``"12"`` or ``"12-15"``.

    Returns:
        The number before the first ``-`` of that position string, parsed
        as a base-10 integer.

    Raises:
        IndexError: If ``field`` is empty.
        ValueError: If the position string does not start with an integer.
    """
    # Read from the argument rather than a module-level ``fields`` global,
    # and avoid indexing dict.values(), which is a view on Python 3.
    first_value = next(iter(field[0].data.values()))
    return int(first_value.split('-')[0], 10)
|
|
|
|
def record_ends_at(fields):
    """Return the final position of a record as an integer.

    Args:
        fields: Non-empty sequence of field entries. Each entry exposes a
            ``data`` mapping; the first value of the last entry's mapping
            is read as a position string such as ``"20"`` or ``"16-20"``.

    Returns:
        The number after the last ``-`` of that position string (or the
        whole string when it has no ``-``), parsed as a base-10 integer.

    Raises:
        IndexError: If ``fields`` is empty.
        ValueError: If the position string does not end with an integer.
    """
    # dict.values() is a non-indexable view on Python 3; take its first item
    # through an iterator, which behaves the same on Python 2.
    first_value = next(iter(fields[-1].data.values()))
    return int(first_value.split('-')[-1], 10)
|
|
|
|
# Position bookkeeping across records. last_record_ends_at lets the loop
# detect when a record continues directly after the previous one.
last_record_begins_at = -1
last_record_ends_at = -1

for rec in records:
    # rec[0] is the record heading text and rec[1] its extracted field rows.
    # Keep only rows that look like real field entries: exactly four columns,
    # a value stored under key 0, and that value shaped like a position or
    # position range ("12" or "12-15"). A list is built (rather than chained
    # filter() calls) so the emptiness test and indexing also work on Python 3.
    fields = [
        x for x in rec[1]
        if len(x.tuple) == 4
        and 0 in x.data
        and re.match(r'^\d+[-]?\d*$', x.data[0])
    ]

    if not fields:
        continue  # no usable field rows detected for this record

    begins_at = record_begins_at(fields)
    ends_at = record_ends_at(fields)

    # FIXME record_ends_at is randomly exploding due to record data being
    #       a lump of text and not necessarily a field entry. I assume
    #       this is cleaned out by the record builder class.

    # Start a new model class unless this record picks up exactly one
    # position after the previous record ended.
    if last_record_ends_at + 1 != begins_at:
        # Class name: text after the last ':' of the heading, with leading
        # non-letters dropped and every non-word character removed.
        name = re.sub(r'^[^a-zA-Z]*', '', rec[0].split(':')[-1])
        name = re.sub(r'[^\w]*', '', name)
        sys.stdout.write("\nclass %s(pyaccuwagemodel.Model):\n" % name)

    # The builder receives the unfiltered rows of the record, as a real list
    # so it can be indexed or traversed more than once.
    for field in builder.load([x.tuple for x in rec[1]]):
        sys.stdout.write('\t' + field + '\n')

    last_record_ends_at = ends_at
|
|
|