#!/usr/bin/python
"""Parse and convert contents of IRS files into pyaccuwage e-file classes.

Reads the PDF named by ``-i``/``--input`` and writes generated class
definitions to standard output. With ``-f``/``--full`` the output is
preceded by the pyaccuwage import lines.
"""
import argparse
import os
import re
import sys

from pyaccuwage.parser import RecordBuilder
from pyaccuwage.pdfextract import PDFRecordFinder

# Accepts a column-0 value such as "12" or "12-34"; both the dash and the
# digits after it are optional.
RANGE_PATTERN = re.compile(r'^\d+[-]?\d*$')


def generate_imports():
    """Return the import header written at the top of a full output file."""
    return "\n".join([
        "from pyaccuwage import model",
        "from pyaccuwage.fields import *",
        "",
        "",
    ])


def generate_class_begin(name):
    """Return the ``class`` line for a generated model called *name*.

    Any leading digits are stripped from *name* first.
    """
    name = re.sub(r"^[\d]*", "", name)
    # The base class must match the ``model`` module imported by
    # generate_imports().
    return "class %s(model.Model):\n" % name


def record_begins_at(fields):
    """Return the start position of the first entry in *fields*.

    The value under key 0 of the entry's ``data`` mapping is split on '-'
    and its first part is parsed as a base-10 integer.

    Raises:
        ValueError: if that part is not a base-10 integer.
    """
    return int(fields[0].data[0].split('-')[0], 10)


def record_ends_at(fields):
    """Return the end position of the last entry in *fields*.

    The value under key 0 of the entry's ``data`` mapping is split on '-'
    and its last part is parsed as a base-10 integer.

    Raises:
        ValueError: if that part is not a base-10 integer.
    """
    return int(fields[-1].data[0].split('-')[-1], 10)


parser = argparse.ArgumentParser(
    description="Parse and convert contents of IRS files into pyaccuwage e-file classes.")
parser.add_argument("-i", "--input",
                    nargs=1, required=True, metavar="file",
                    type=argparse.FileType('r'),
                    help="Source PDF file, ie: p1220.pdf")
parser.add_argument("-f", "--full",
                    help="Generate full python file, including related imports.",
                    action="store_true")

args = parser.parse_args()

if args.full:
    sys.stdout.write(generate_imports())

# argparse opened the input only to validate it; just its path is handed to
# PDFRecordFinder, so release the handle straight away.
input_handle = args.input[0]
source_file = os.path.abspath(input_handle.name)
input_handle.close()

doc = PDFRecordFinder(source_file)
records = doc.records()
builder = RecordBuilder()

last_record_ends_at = -1

for rec in records:
    # Keep only entries that have exactly 4 items, have a value at position 0,
    # and whose position-0 value looks like a length range. The checks run in
    # this order so that data[0] is only read when key 0 exists.
    fields = [
        entry for entry in rec[1]
        if len(entry.tuple) == 4
        and 0 in entry.data
        and RANGE_PATTERN.match(entry.data[0])
    ]

    if not fields:
        continue

    begins_at = record_begins_at(fields)
    ends_at = record_ends_at(fields)

    # FIXME record_ends_at is randomly exploding due to record data being
    #       a lump of text and not necessarily a field entry. I assume
    #       this is cleaned out by the record builder class.
    # NOTE(review): RANGE_PATTERN accepts a trailing dash such as "12-", for
    #       which the end part is empty and int() raises ValueError; this may
    #       be one source of the failure above -- confirm.

    # A record that does not start right after the previous one ended gets a
    # new class header; otherwise its fields are appended to the class that
    # is already open.
    if last_record_ends_at + 1 != begins_at:
        name = re.sub(r'^[^a-zA-Z]*', '', rec[0].split(':')[-1])
        name = re.sub(r'[^\w]*', '', name)
        sys.stdout.write("\nclass %s(object):\n" % name)

    # The builder is given every entry of the record, not the filtered list.
    for line in builder.load([entry.tuple for entry in rec[1]]):
        sys.stdout.write('\t' + line + '\n')

    last_record_ends_at = ends_at