Getting closer; added a FIXME to one of the fields. Still having issues with columns in description fields.
This commit is contained in:
parent
5dde3be536
commit
40fcbdc8b8
3 changed files with 36 additions and 7 deletions
|
@ -178,7 +178,12 @@ class RecordBuilder(object):
|
||||||
|
|
||||||
(f_range, f_name, f_length, f_desc) = list(entry) + ['']*(4-len(entry))
|
(f_range, f_name, f_length, f_desc) = list(entry) + ['']*(4-len(entry))
|
||||||
|
|
||||||
f_length = int(f_length)
|
try:
|
||||||
|
f_length = int(f_length)
|
||||||
|
except ValueError, e:
|
||||||
|
# bad result, skip
|
||||||
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
assert f_length == RangeToken(f_range).value
|
assert f_length == RangeToken(f_range).value
|
||||||
except AssertionError:
|
except AssertionError:
|
||||||
|
|
|
@ -13,7 +13,7 @@ class PDFRecordFinder(object):
|
||||||
if not heading_exp:
|
if not heading_exp:
|
||||||
heading_exp = re.compile('\s+Record Name: (.*)')
|
heading_exp = re.compile('\s+Record Name: (.*)')
|
||||||
|
|
||||||
opts = ["pdftotext", "-layout", "-nopgbrk", src, '-']
|
opts = ["pdftotext", "-layout", "-nopgbrk", "-eol", "unix", src, '-']
|
||||||
pdftext = subprocess.check_output(opts)
|
pdftext = subprocess.check_output(opts)
|
||||||
self.textrows = pdftext.split('\n')
|
self.textrows = pdftext.split('\n')
|
||||||
self.heading_exp = heading_exp
|
self.heading_exp = heading_exp
|
||||||
|
@ -31,8 +31,17 @@ class PDFRecordFinder(object):
|
||||||
for (i, row) in enumerate(self.textrows):
|
for (i, row) in enumerate(self.textrows):
|
||||||
match = self.heading_exp.match(row)
|
match = self.heading_exp.match(row)
|
||||||
if match:
|
if match:
|
||||||
if not ''.join(match.groups()).lower().endswith('(continued)'):
|
print i,match.groups()
|
||||||
results.append((i, ''.join(match.groups())))
|
#if not ''.join(match.groups()).lower().endswith('(continued)'):
|
||||||
|
results.append((i, ''.join(match.groups())))
|
||||||
|
|
||||||
|
"""
|
||||||
|
results2 = []
|
||||||
|
for r in results:
|
||||||
|
if len(results2)==0 or results2[-1:][0][1] != r[1]:
|
||||||
|
results2.append(r)
|
||||||
|
results = results2
|
||||||
|
"""
|
||||||
|
|
||||||
merged = []
|
merged = []
|
||||||
for (a, b) in zip(results, results[1:] + [(len(self.textrows),None)]):
|
for (a, b) in zip(results, results[1:] + [(len(self.textrows),None)]):
|
||||||
|
@ -50,6 +59,9 @@ class PDFRecordFinder(object):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if cc.is_next_field(row):
|
if cc.is_next_field(row):
|
||||||
|
if row[1][1] == 'Vendor Indicator':
|
||||||
|
import pdb
|
||||||
|
pdb.set_trace()
|
||||||
yield cc
|
yield cc
|
||||||
cc = ColumnCollector()
|
cc = ColumnCollector()
|
||||||
|
|
||||||
|
@ -123,6 +135,13 @@ class ColumnCollector(object):
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# try adding a wiggle room value?
|
# try adding a wiggle room value?
|
||||||
|
# FIXME:
|
||||||
|
# Sometimes description columns contain column-like
|
||||||
|
# layouts, and this causes the ColumnCollector to become
|
||||||
|
# confused. Perhaps we could check to see if a column occurs
|
||||||
|
# after the maximum column, and assume it's part of the
|
||||||
|
# max column?
|
||||||
|
|
||||||
raise UnknownColumn
|
raise UnknownColumn
|
||||||
|
|
||||||
def is_next_field(self, data):
|
def is_next_field(self, data):
|
||||||
|
|
|
@ -4,6 +4,7 @@ from pyaccuwage.pdfextract import PDFRecordFinder
|
||||||
import argparse
|
import argparse
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description="Parse and convert contents of IRS files into pyaccuwage e-file classes.")
|
parser = argparse.ArgumentParser(description="Parse and convert contents of IRS files into pyaccuwage e-file classes.")
|
||||||
parser.add_argument("-i", "--input", nargs=1, required=True, metavar="file", type=argparse.FileType('r'), help="Source PDF file, ie: p1220.pdf")
|
parser.add_argument("-i", "--input", nargs=1, required=True, metavar="file", type=argparse.FileType('r'), help="Source PDF file, ie: p1220.pdf")
|
||||||
|
@ -30,8 +31,12 @@ source_file = os.path.abspath(args.input[0].name)
|
||||||
doc = PDFRecordFinder(source_file)
|
doc = PDFRecordFinder(source_file)
|
||||||
records = doc.records()
|
records = doc.records()
|
||||||
builder = RecordBuilder()
|
builder = RecordBuilder()
|
||||||
for rec in records:
|
|
||||||
print 'name', rec[0]
|
for rec in records:
|
||||||
for field in builder.load(map(lambda x:x.tuple, rec[1][1:])):
|
|
||||||
print field
|
sys.stdout.write("class %s(object):\n" % re.sub('[^\w]','',rec[0]))
|
||||||
|
|
||||||
|
for field in builder.load(map(lambda x:x.tuple, rec[1][1:])):
|
||||||
|
sys.stdout.write('\t' + field + '\n')
|
||||||
|
#print field
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue