Add new PDF extraction capability

This commit is contained in:
Binh 2012-07-10 15:24:13 -05:00
parent b77b80e485
commit e8145c5616
4 changed files with 174 additions and 134 deletions

View file

@ -96,6 +96,8 @@ class RangeToken(BaseToken):
@property @property
def value(self): def value(self):
if '-' not in self._value:
return int(self._value)
return reduce(lambda x,y: y-x, map(int, self._value.split('-')))+1 return reduce(lambda x,y: y-x, map(int, self._value.split('-')))+1
@property @property
@ -115,8 +117,7 @@ class NumericToken(BaseToken):
return int(self._value) return int(self._value)
class RecordBuilder(object):
class PastedDefParser(object):
import fields import fields
TOKEN_TYPES = [ TOKEN_TYPES = [
@ -163,6 +164,102 @@ class PastedDefParser(object):
}) })
] ]
def load(self, entries):
    """Run raw field tuples through the full conversion pipeline.

    Expects entries shaped like:
        [('1-5', 'Field Name', '5', 'Description of field.'), ...]

    Returns the lazily-evaluated record generator produced by
    ``_convert_to_records``.
    """
    compiled = self._compile(entries)
    typed = self._guess_field_types(compiled)
    return self._convert_to_records(typed)
def _compile(self, entries):
    """Normalize raw field tuples into per-field description dicts.

    entries: iterable of (range, name, length, description) tuples, e.g.
        [('1-5', 'Field Name', '5', 'Description of field.'), ...]

    Yields one dict per entry with keys: name, desc, length, required.
    Raises ValueError when the stated length disagrees with the range.
    """
    for (f_range, f_name, f_length, f_desc) in entries:
        f_length = int(f_length)
        # The declared length must match the span implied by the range
        # column. The original dropped into pdb here (debugging leftover
        # that must not ship) and relied on `assert`, which is stripped
        # under -O; raise explicitly instead.
        if f_length != RangeToken(f_range).value:
            raise ValueError(
                "field %r: length %d does not match range %r"
                % (f_name, f_length, f_range))
        name_parts = f_name.split(' ')
        # A trailing "(optional)" marker wins; otherwise the word
        # "required" anywhere in the description marks the field required.
        if name_parts[-1].lower() == '(optional)':
            name_parts = name_parts[0:-1]
            required = False
        elif re.search('required', f_desc, re.IGNORECASE):
            required = True
        else:
            required = None  # unknown: downstream decides the default
        f_name = u'_'.join(map(lambda x: x.lower(), name_parts))
        f_name = re.sub(r'[^\w]', '', f_name)
        yield {
            'name': f_name,
            'desc': '(' + f_range + '). ' + f_desc,
            'length': f_length,
            'required': required,
        }
def _guess_field_types(self, entries):
    """Score each entry against FIELD_TYPES and attach a guessed type.

    Each criteria dict may constrain the field length ('length') and/or
    award one point per matching regexp ('regexp'). The highest-scoring
    type wins; entries matching nothing fall back to TextField.
    Yields the same entry dicts with 'guessed_type' added.
    """
    length_matches = LengthExpression()
    for entry in entries:
        scores = dict((field_class, 0) for (field_class, _) in self.FIELD_TYPES)
        for field_class, criteria in self.FIELD_TYPES:
            # A length constraint that fails disqualifies this type outright.
            if 'length' in criteria:
                if not length_matches(int(entry['length']), criteria['length']):
                    continue
            # One point per regexp criterion that hits its entry value.
            if 'regexp' in criteria:
                for crit_key, patterns in criteria['regexp'].items():
                    for pattern in patterns:
                        if pattern.search(entry[crit_key]):
                            scores[field_class] += 1
        # Sort ascending by score; the last item is the best candidate.
        ranked = sorted(scores.items(), key=lambda pair: pair[1])
        total_hits = sum(score for _, score in ranked)
        entry['guessed_type'] = ranked[-1][0] if total_hits > 0 else self.fields.TextField
        yield entry
def _convert_to_records(self, entries):
    """Render guessed-type entries as python field-definition source lines.

    Each yielded string looks like:
        name_padded_to_40 = FieldType(max_length=N[, required=True|False])
    Anonymous 'blank' fields are numbered sequentially so names stay unique.
    """
    blank_count = 1
    for entry in entries:
        result = []
        add = result.append
        # FIELD NAME
        if entry['name'] == 'blank':
            add((u'blank%d' % blank_count).ljust(40))
            blank_count += 1
        else:
            add(entry['name'].ljust(40))
        add(' = ')
        if entry['guessed_type']:
            add(entry['guessed_type'].__name__)
        args = []
        args.append("max_length=%d" % entry['length'])
        # 'required' is ternary (True/False/None); only emit the keyword
        # when the value is known. (Fixed `!= None` -> `is not None`,
        # PEP 8 E711.)
        if entry['required'] is not None:
            args.append("required=%s" % ('True' if entry['required'] else 'False'))
        add("(" + ", ".join(args) + ")")
        yield "".join(result)
class PastedDefParser(RecordBuilder):
def load(self, infile): def load(self, infile):
tokens = self._tokenize(infile) tokens = self._tokenize(infile)
entries = self._parse(tokens) entries = self._parse(tokens)
@ -278,6 +375,7 @@ class PastedDefParser(object):
}) })
"""
def _guess_field_types(self, entries): def _guess_field_types(self, entries):
lengthexp = LengthExpression() lengthexp = LengthExpression()
@ -303,7 +401,8 @@ class PastedDefParser(object):
entry['guessed_type'] = matches[-1][0] if matches_found else self.fields.TextField entry['guessed_type'] = matches[-1][0] if matches_found else self.fields.TextField
yield entry yield entry
"""
"""
def _convert_to_records(self, entries): def _convert_to_records(self, entries):
blank_count = 1 blank_count = 1
for entry in entries: for entry in entries:
@ -332,4 +431,4 @@ class PastedDefParser(object):
yield "".join(result) yield "".join(result)
"""

View file

@ -18,35 +18,12 @@ class PDFRecordFinder(object):
self.textrows = pdftext.split('\n') self.textrows = pdftext.split('\n')
self.heading_exp = heading_exp self.heading_exp = heading_exp
"""
def columns(self):
results = []
cc = ColumnCollector()
for heading, group in self.record_grouping():
print "HEADING", heading
for row in group:
if cc.is_next_field(row):
yield cc
cc = ColumnCollector()
#print row
try:
cc.add(row)
except UnknownColumn, e:
results.append(cc)
cc = ColumnCollector()
def records(self):
headings = self.locate_heading_rows()
def record_grouping(self): for (start, end, name) in headings:
row_iter = iter(self.textrows) name = name.decode('ascii', 'ignore')
i = 0 yield (name, list(self.find_fields(iter(self.textrows[start+1:end]))))
for row in row_iter:
i += 1
match = self.heading_exp.match(row)
if match:
yield (match.groups(), self.extract_record_columns(row_iter))
"""
def locate_heading_rows(self): def locate_heading_rows(self):
@ -56,45 +33,32 @@ class PDFRecordFinder(object):
if match: if match:
if not ''.join(match.groups()).lower().endswith('(continued)'): if not ''.join(match.groups()).lower().endswith('(continued)'):
results.append((i, ''.join(match.groups()))) results.append((i, ''.join(match.groups())))
return results
merged = []
for (a, b) in zip(results, results[1:] + [(len(self.textrows),None)]):
merged.append( (a[0], b[0]-1, a[1]) )
def records2(self): return merged
row_iter = iter(self.textrows)
record = True
while record:
record = self.extract_record(row_iter)
yield record
def extract_record(self, row_iter):
heading = self.find_heading(row_iter)
fields = self.find_fields(row_iter)
return heading, list(fields)
def find_heading(self, row_iter):
for row in row_iter:
heading_match = self.heading_exp.match(row)
if heading_match:
return heading_match.groups()
def find_fields(self, row_iter): def find_fields(self, row_iter):
cc = ColumnCollector() cc = ColumnCollector()
for r in row_iter: for r in row_iter:
row = self.extract_columns_from_row(r) row = self.extract_columns_from_row(r.decode('UTF-8'))
if not row: if not row:
continue continue
if cc.is_next_field(row): if cc.is_next_field(row):
yield cc yield cc
cc = ColumnCollector() cc = ColumnCollector()
try: try:
cc.add(row) cc.add(row)
except UnknownColumn, e: except UnknownColumn, e:
print 'UNKNOWN COLUMN', row
raise StopIteration raise StopIteration
yield cc
def extract_columns_from_row(self, row): def extract_columns_from_row(self, row):
@ -124,7 +88,7 @@ class PDFRecordFinder(object):
end = white_iter.next() end = white_iter.next()
if start != end: if start != end:
row_result.append( row_result.append(
(start, row[start:end]) (start, row[start:end].encode('ascii','ignore'))
) )
except StopIteration: except StopIteration:
@ -133,78 +97,6 @@ class PDFRecordFinder(object):
return row_result return row_result
def extract_row_columns(self, row_iter):
re_multiwhite = re.compile(r'\s{2,}')
full_width_text_count = 0
#for r in row_iter:
row = None
while not row:
row = row_iter.next()
row = row.decode('UTF-8')
# IF LINE CONTAINS MULTIPLE WHITESPACE, IT IS PROBABLY A TABLE
if not re_multiwhite.search(row):
full_width_text_count += 1
row = None
if True:
white_ranges = [0,]
pos = 0
match = True
while pos < len(row):
match = re_multiwhite.search(row[pos:])
if match:
white_ranges.append(pos + match.start())
white_ranges.append(pos + match.end())
pos += match.end()
else:
white_ranges.append(len(row))
pos = len(row)
row_result = []
white_iter = iter(white_ranges)
while white_iter:
try:
start = white_iter.next()
end = white_iter.next()
if start != end:
row_result.append(
(start, row[start:end])
)
except StopIteration:
white_iter = None
#print white_ranges
return row_result
#yield row_result
#result.append(row_result)
"""
row_result = []
pos = 0
while pos < len(row):
match = re_multiwhite.search(row[pos:])
if match and match.start() > 0:
row_result.append((
pos,
row[pos:pos+match.start()],))
pos += match.end()
else:
if match:
pos += match.end()
row_result.append((pos,row[pos:]))
pos += len(row)
result.append(row_result)
row_result = []
"""
#return result
class UnknownColumn(Exception): class UnknownColumn(Exception):
pass pass
@ -246,6 +138,8 @@ class ColumnCollector(object):
return self.data.keys()[0] == first_key return self.data.keys()[0] == first_key
return False return False
@property
def tuple(self):
    """Collected column values as a tuple, ordered by ascending column key.

    NOTE(review): the property name shadows the builtin ``tuple`` on the
    class; kept unchanged for caller compatibility.
    """
    ordered_keys = sorted(self.data.keys())
    return tuple(self.data[key] for key in ordered_keys)

47
scripts/pyaccuwage-pdfparse Executable file
View file

@ -0,0 +1,47 @@
#!/usr/bin/python
"""Parse IRS publication PDFs and emit pyaccuwage e-file record classes."""
from pyaccuwage.parser import RecordBuilder
from pyaccuwage.pdfexport import PDFRecordFinder
import argparse
import sys

parser = argparse.ArgumentParser(
    description="Parse and convert contents of IRS files into pyaccuwage e-file classes.")
parser.add_argument("-f", "--full",
    help="Generate full python file, including related imports.",
    action="store_true")
parser.add_argument("-i", "--input", metavar="file",
    type=argparse.FileType('r'),
    help="Source PDF file, ie: p1220.pdf")
# The code below reads args.classname, but no such option was ever
# registered, so every run died with AttributeError. Register it here.
parser.add_argument("-c", "--classname", metavar="name", default=None,
    help="Name of the generated record class (default: GeneratedRecord).")
args = parser.parse_args()

# Dead prototype from the paste-based (stdin) workflow, superseded by the
# PDF pipeline. TODO(review): delete once the PDF path is confirmed working.
"""
lines = []
for x in sys.stdin.readlines():
    lines.append(x)

pdp = PastedDefParser()
tokens = pdp.load("".join(lines))
"""
def generate_imports():
    """Return the import header emitted at the top of a generated module."""
    header_lines = (
        "from pyaccuwage import model",
        "from pyaccuwage.fields import *",
        "",
        "",
    )
    return "\n".join(header_lines)
def generate_class_begin(name):
    """Return the ``class`` statement line for a generated record class.

    Fixes the original 'mode.Model' typo: generate_imports() brings in
    ``model``, so the generated base class must be ``model.Model`` or the
    emitted code raises NameError.
    """
    return "class %s(model.Model):\n" % name
if args.full:
    sys.stdout.write(generate_imports())

# getattr keeps this working even if the parser above does not register
# --classname (the original accessed args.classname directly and crashed
# with AttributeError on every invocation).
classname = getattr(args, "classname", None) or "GeneratedRecord"
sys.stdout.write(generate_class_begin(classname))

# NOTE(review): `tokens` is never defined on the PDF path -- its only
# producer (the PastedDefParser stdin block) is commented out above, so
# this loop raises NameError at runtime. TODO: produce tokens via
# PDFRecordFinder + RecordBuilder from args.input.
for x in tokens:
    sys.stdout.write('\t' + x + '\n')

View file

@ -1,7 +1,7 @@
from distutils.core import setup from distutils.core import setup
setup(name='pyaccuwage', setup(name='pyaccuwage',
version='0.0.6', version='0.0.7',
packages=['pyaccuwage'], packages=['pyaccuwage'],
scripts=['scripts/pyaccuwage-parse'], scripts=['scripts/pyaccuwage-parse', 'scripts/pyaccuwage-pdfparse'],
zip_safe=True, zip_safe=True,
) )