adding new pdf extract capability
This commit is contained in:
parent
b77b80e485
commit
e8145c5616
4 changed files with 174 additions and 134 deletions
|
@ -96,6 +96,8 @@ class RangeToken(BaseToken):
|
|||
|
||||
@property
|
||||
def value(self):
|
||||
if '-' not in self._value:
|
||||
return int(self._value)
|
||||
return reduce(lambda x,y: y-x, map(int, self._value.split('-')))+1
|
||||
|
||||
@property
|
||||
|
@ -115,8 +117,7 @@ class NumericToken(BaseToken):
|
|||
return int(self._value)
|
||||
|
||||
|
||||
|
||||
class PastedDefParser(object):
|
||||
class RecordBuilder(object):
|
||||
import fields
|
||||
|
||||
TOKEN_TYPES = [
|
||||
|
@ -163,6 +164,102 @@ class PastedDefParser(object):
|
|||
})
|
||||
]
|
||||
|
||||
def load(self, entries):
    """Turn raw spec tuples into generated field-definition source lines.

    EXPECTS ENTRIES TO BE IN THE FORM OF
    [('1-5', 'Field Name', '5', 'Description of field.'), ...]
    """
    compiled = self._compile(entries)
    typed = self._guess_field_types(compiled)
    return self._convert_to_records(typed)
|
||||
|
||||
|
||||
def _compile(self, entries):
    """Normalize raw (range, name, length, description) tuples into field dicts.

    Each yielded dict has keys: 'name' (snake_case identifier), 'desc'
    (description prefixed with the column range), 'length' (int), and
    'required' (True/False/None when unknown).

    Raises ValueError when the declared length disagrees with the range.
    """
    for (f_range, f_name, f_length, f_desc) in entries:
        f_length = int(f_length)
        # BUG FIX: a leftover debugging hook (assert + import pdb;
        # pdb.set_trace()) dropped into the debugger on bad input, and
        # asserts vanish under `python -O`. Raise a descriptive error instead.
        if f_length != RangeToken(f_range).value:
            raise ValueError(
                "Field %r: declared length %d does not match range %r"
                % (f_name, f_length, f_range))

        name_parts = f_name.split(' ')

        # A '(optional)' suffix on the name marks the field not-required;
        # 'required' anywhere in the description marks it required;
        # otherwise requiredness is unknown (None).
        if name_parts[-1].lower() == '(optional)':
            name_parts = name_parts[0:-1]
            required = False
        elif re.search('required', f_desc, re.IGNORECASE):
            required = True
        else:
            required = None

        # Build a snake_case identifier and strip any non-word characters.
        f_name = u'_'.join(part.lower() for part in name_parts)
        f_name = re.sub(r'[^\w]', '', f_name)

        yield {
            'name': f_name,
            'desc': '(' + f_range + '). ' + f_desc,
            'length': f_length,
            'required': required,
        }
|
||||
|
||||
|
||||
def _guess_field_types(self, entries):
    """Score each entry against self.FIELD_TYPES and attach 'guessed_type'.

    An entry that matches nothing falls back to self.fields.TextField.
    """
    lengthexp = LengthExpression()

    for entry in entries:
        # One score bucket per candidate field class.
        scores = {classtype: 0 for (classtype, _) in self.FIELD_TYPES}

        for (classtype, criteria) in self.FIELD_TYPES:
            # A failing length constraint disqualifies this class outright.
            if 'length' in criteria:
                if not lengthexp(int(entry['length']), criteria['length']):
                    continue

            # Each regexp hit against the named entry attribute earns a point.
            if 'regexp' in criteria:
                for crit_key, patterns in criteria['regexp'].items():
                    for pattern in patterns:
                        if pattern.search(entry[crit_key]):
                            scores[classtype] += 1

        # Stable ascending sort: the last item is the best-scoring class.
        ranked = sorted(scores.items(), key=lambda item: item[1])
        total = sum(score for _, score in ranked)

        entry['guessed_type'] = ranked[-1][0] if total > 0 else self.fields.TextField
        yield entry
|
||||
|
||||
def _convert_to_records(self, entries):
|
||||
blank_count = 1
|
||||
for entry in entries:
|
||||
result = []
|
||||
add = result.append
|
||||
|
||||
# FIELD NAME
|
||||
if entry['name'] == 'blank':
|
||||
add( (u'blank%d' % blank_count).ljust(40) )
|
||||
blank_count += 1
|
||||
else:
|
||||
add(entry['name'].ljust(40))
|
||||
|
||||
add(' = ')
|
||||
|
||||
if entry['guessed_type']:
|
||||
add(entry['guessed_type'].__name__)
|
||||
|
||||
args = []
|
||||
args.append("max_length=%d" % entry['length'])
|
||||
if entry['required'] != None:
|
||||
args.append("required=%s" % ('True' if entry['required'] else 'False'))
|
||||
|
||||
add("(" + ", ".join(args) + ")")
|
||||
|
||||
|
||||
yield "".join(result)
|
||||
|
||||
|
||||
|
||||
class PastedDefParser(RecordBuilder):
|
||||
|
||||
def load(self, infile):
|
||||
tokens = self._tokenize(infile)
|
||||
entries = self._parse(tokens)
|
||||
|
@ -278,6 +375,7 @@ class PastedDefParser(object):
|
|||
})
|
||||
|
||||
|
||||
"""
|
||||
def _guess_field_types(self, entries):
|
||||
lengthexp = LengthExpression()
|
||||
|
||||
|
@ -303,7 +401,8 @@ class PastedDefParser(object):
|
|||
entry['guessed_type'] = matches[-1][0] if matches_found else self.fields.TextField
|
||||
yield entry
|
||||
|
||||
|
||||
"""
|
||||
"""
|
||||
def _convert_to_records(self, entries):
|
||||
blank_count = 1
|
||||
for entry in entries:
|
||||
|
@ -332,4 +431,4 @@ class PastedDefParser(object):
|
|||
|
||||
yield "".join(result)
|
||||
|
||||
|
||||
"""
|
||||
|
|
|
@ -18,35 +18,12 @@ class PDFRecordFinder(object):
|
|||
self.textrows = pdftext.split('\n')
|
||||
self.heading_exp = heading_exp
|
||||
|
||||
"""
|
||||
def columns(self):
|
||||
results = []
|
||||
cc = ColumnCollector()
|
||||
for heading, group in self.record_grouping():
|
||||
print "HEADING", heading
|
||||
for row in group:
|
||||
if cc.is_next_field(row):
|
||||
yield cc
|
||||
cc = ColumnCollector()
|
||||
#print row
|
||||
try:
|
||||
cc.add(row)
|
||||
except UnknownColumn, e:
|
||||
results.append(cc)
|
||||
cc = ColumnCollector()
|
||||
|
||||
|
||||
|
||||
def record_grouping(self):
|
||||
row_iter = iter(self.textrows)
|
||||
i = 0
|
||||
for row in row_iter:
|
||||
i += 1
|
||||
match = self.heading_exp.match(row)
|
||||
if match:
|
||||
yield (match.groups(), self.extract_record_columns(row_iter))
|
||||
|
||||
"""
|
||||
def records(self):
    """Yield (heading_name, field_list) for every heading section located."""
    for (start, end, title) in self.locate_heading_rows():
        # Python 2 bytes -> text; non-ASCII garbage in headings is dropped.
        title = title.decode('ascii', 'ignore')
        section_rows = iter(self.textrows[start + 1:end])
        yield (title, list(self.find_fields(section_rows)))
|
||||
|
||||
|
||||
def locate_heading_rows(self):
|
||||
|
@ -56,45 +33,32 @@ class PDFRecordFinder(object):
|
|||
if match:
|
||||
if not ''.join(match.groups()).lower().endswith('(continued)'):
|
||||
results.append((i, ''.join(match.groups())))
|
||||
return results
|
||||
|
||||
merged = []
|
||||
for (a, b) in zip(results, results[1:] + [(len(self.textrows),None)]):
|
||||
merged.append( (a[0], b[0]-1, a[1]) )
|
||||
|
||||
def records2(self):
    # Alternative record iterator; appears experimental/superseded by records().
    # NOTE(review): extract_record returns a 2-tuple, which is always truthy,
    # so the `while record` guard may never become false on its own — this
    # seems to rely on the underlying iterator raising StopIteration.
    # Confirm before reusing.
    row_iter = iter(self.textrows)
    record = True
    while record:
        record = self.extract_record(row_iter)
        yield record
|
||||
|
||||
|
||||
def extract_record(self, row_iter):
    """Consume one heading plus its following field rows from row_iter."""
    found_heading = self.find_heading(row_iter)
    found_fields = list(self.find_fields(row_iter))
    return found_heading, found_fields
|
||||
|
||||
|
||||
def find_heading(self, row_iter):
|
||||
for row in row_iter:
|
||||
heading_match = self.heading_exp.match(row)
|
||||
if heading_match:
|
||||
return heading_match.groups()
|
||||
return merged
|
||||
|
||||
|
||||
def find_fields(self, row_iter):
|
||||
cc = ColumnCollector()
|
||||
for r in row_iter:
|
||||
row = self.extract_columns_from_row(r)
|
||||
row = self.extract_columns_from_row(r.decode('UTF-8'))
|
||||
|
||||
if not row:
|
||||
continue
|
||||
|
||||
if cc.is_next_field(row):
|
||||
yield cc
|
||||
cc = ColumnCollector()
|
||||
|
||||
try:
|
||||
cc.add(row)
|
||||
|
||||
except UnknownColumn, e:
|
||||
print 'UNKNOWN COLUMN', row
|
||||
raise StopIteration
|
||||
yield cc
|
||||
|
||||
|
||||
def extract_columns_from_row(self, row):
|
||||
|
@ -124,7 +88,7 @@ class PDFRecordFinder(object):
|
|||
end = white_iter.next()
|
||||
if start != end:
|
||||
row_result.append(
|
||||
(start, row[start:end])
|
||||
(start, row[start:end].encode('ascii','ignore'))
|
||||
)
|
||||
|
||||
except StopIteration:
|
||||
|
@ -133,78 +97,6 @@ class PDFRecordFinder(object):
|
|||
return row_result
|
||||
|
||||
|
||||
|
||||
def extract_row_columns(self, row_iter):
    # Pull the next table-like line from row_iter and split it on runs of
    # two-or-more whitespace characters, returning [(start_offset, text), ...].
    # NOTE(review): looks like an older variant of extract_columns_from_row;
    # verify it is still called anywhere before relying on it.
    re_multiwhite = re.compile(r'\s{2,}')
    full_width_text_count = 0

    #for r in row_iter:
    row = None
    while not row:
        # Python 2 iterator protocol (.next()); bytes are decoded to unicode.
        row = row_iter.next()
        row = row.decode('UTF-8')

        # IF LINE CONTAINS MULTIPLE WHITESPACE, IT IS PROBABLY A TABLE
        if not re_multiwhite.search(row):
            full_width_text_count += 1
            row = None

    if True:
        # Collect alternating start/end offsets of every multi-space gap.
        white_ranges = [0,]
        pos = 0
        match = True
        while pos < len(row):
            match = re_multiwhite.search(row[pos:])
            if match:
                white_ranges.append(pos + match.start())
                white_ranges.append(pos + match.end())
                pos += match.end()
            else:
                white_ranges.append(len(row))
                pos = len(row)

        # Pair consecutive offsets and slice out the non-empty cells.
        row_result = []
        white_iter = iter(white_ranges)
        while white_iter:
            try:
                start = white_iter.next()
                end = white_iter.next()
                if start != end:
                    row_result.append(
                        (start, row[start:end])
                    )
            except StopIteration:
                white_iter = None

        #print white_ranges
        return row_result
        #yield row_result
        #result.append(row_result)
|
||||
|
||||
|
||||
"""
|
||||
row_result = []
|
||||
pos = 0
|
||||
while pos < len(row):
|
||||
match = re_multiwhite.search(row[pos:])
|
||||
if match and match.start() > 0:
|
||||
row_result.append((
|
||||
pos,
|
||||
row[pos:pos+match.start()],))
|
||||
pos += match.end()
|
||||
else:
|
||||
if match:
|
||||
pos += match.end()
|
||||
row_result.append((pos,row[pos:]))
|
||||
pos += len(row)
|
||||
result.append(row_result)
|
||||
row_result = []
|
||||
"""
|
||||
#return result
|
||||
|
||||
|
||||
class UnknownColumn(Exception):
    """Raised when a parsed cell does not line up with any known column."""
|
||||
|
||||
|
@ -246,6 +138,8 @@ class ColumnCollector(object):
|
|||
return self.data.keys()[0] == first_key
|
||||
return False
|
||||
|
||||
|
||||
@property
|
||||
def tuple(self):
|
||||
return tuple(map(lambda k:self.data[k], sorted(self.data.keys())))
|
||||
|
||||
|
||||
|
|
47
scripts/pyaccuwage-pdfparse
Executable file
47
scripts/pyaccuwage-pdfparse
Executable file
|
@ -0,0 +1,47 @@
|
|||
#!/usr/bin/python
"""Generate pyaccuwage model field definitions from an IRS PDF spec."""
from pyaccuwage.parser import RecordBuilder
from pyaccuwage.pdfexport import PDFRecordFinder
import argparse
import sys

parser = argparse.ArgumentParser(description="Parse and convert contents of IRS files into pyaccuwage e-file classes.")
parser.add_argument("-f", "--full", help="Generate full python file, including related imports.", action="store_true")
parser.add_argument("-i", "--input", metavar="file", type=argparse.FileType('r'), help="Source PDF file, ie: p1220.pdf")
# BUG FIX: args.classname is consulted below but this option was never defined,
# which made any invocation crash with AttributeError.
parser.add_argument("-c", "--classname", metavar="name", help="Name of the generated model class.")


args = parser.parse_args()

"""
lines = []
for x in sys.stdin.readlines():
    lines.append(x)

pdp = PastedDefParser()
tokens = pdp.load("".join(lines))
"""

def generate_imports():
    # Header emitted when --full is requested.
    return "\n".join([
        "from pyaccuwage import model",
        "from pyaccuwage.fields import *",
        "",
        "",
    ])

def generate_class_begin(name):
    # BUG FIX: 'mode.Model' was a typo — generate_imports() imports 'model'.
    return "class %s(model.Model):\n" % name

if args.full:
    sys.stdout.write(generate_imports())

if args.classname:
    classname = args.classname
else:
    classname = "GeneratedRecord"

sys.stdout.write(generate_class_begin(classname))

# NOTE(review): 'tokens' was referenced without ever being assigned (the code
# producing it is commented out above), so the script died with NameError.
# Default to empty so the class header is still emitted; wire this up to
# PDFRecordFinder / PastedDefParser output when that pipeline is finished.
tokens = []

for x in tokens:
    sys.stdout.write('\t' + x + '\n')
|
4
setup.py
4
setup.py
|
@ -1,7 +1,7 @@
|
|||
from distutils.core import setup

# Package metadata for pyaccuwage; version bumped to 0.0.7 and the new
# pyaccuwage-pdfparse script registered alongside the existing parser script.
setup(name='pyaccuwage',
      version='0.0.7',
      packages=['pyaccuwage'],
      scripts=['scripts/pyaccuwage-parse', 'scripts/pyaccuwage-pdfparse'],
      zip_safe=True,
)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue