adding new pdf extract capability
This commit is contained in:
parent
b77b80e485
commit
e8145c5616
4 changed files with 174 additions and 134 deletions
|
@ -96,6 +96,8 @@ class RangeToken(BaseToken):
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def value(self):
|
def value(self):
|
||||||
|
if '-' not in self._value:
|
||||||
|
return int(self._value)
|
||||||
return reduce(lambda x,y: y-x, map(int, self._value.split('-')))+1
|
return reduce(lambda x,y: y-x, map(int, self._value.split('-')))+1
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -115,8 +117,7 @@ class NumericToken(BaseToken):
|
||||||
return int(self._value)
|
return int(self._value)
|
||||||
|
|
||||||
|
|
||||||
|
class RecordBuilder(object):
|
||||||
class PastedDefParser(object):
|
|
||||||
import fields
|
import fields
|
||||||
|
|
||||||
TOKEN_TYPES = [
|
TOKEN_TYPES = [
|
||||||
|
@ -163,6 +164,102 @@ class PastedDefParser(object):
|
||||||
})
|
})
|
||||||
]
|
]
|
||||||
|
|
||||||
|
def load(self, entries):
    """Turn raw field-definition rows into record declaration lines.

    ``entries`` is expected to be an iterable of 4-item rows in the form
    ``('1-5', 'Field Name', '5', 'Description of field.')``.  The rows are
    passed through ``_compile``, ``_guess_field_types`` and
    ``_convert_to_records``, in that order, and the output of the last
    stage is returned.
    """
    compiled = self._compile(entries)
    typed = self._guess_field_types(compiled)
    return self._convert_to_records(typed)
|
||||||
|
|
||||||
|
|
||||||
|
def _compile(self, entries):
    """Normalise raw definition rows into field dictionaries.

    Each row of ``entries`` is unpacked as ``(range, name, length,
    description)``.  One dict is yielded per row with the keys:

    * ``name``     -- the name lower-cased, its words joined with
      underscores and every non-word character removed.
    * ``desc``     -- the description prefixed with ``(<range>). ``.
    * ``length``   -- the declared length converted to ``int``.
    * ``required`` -- ``False`` when the name ends in "(optional)",
      ``True`` when the description mentions "required" (any case),
      otherwise ``None``.

    A row whose declared length differs from the length computed by
    ``RangeToken`` for its range triggers a ``UserWarning``; the row is
    still yielded.
    """
    import warnings

    for (f_range, f_name, f_length, f_desc) in entries:
        f_length = int(f_length)

        # Report inconsistent rows instead of halting inside an interactive
        # debugger, and do it with a plain comparison so the check is not
        # stripped when Python runs with -O.
        range_length = RangeToken(f_range).value
        if f_length != range_length:
            warnings.warn(
                "field %r: declared length %d does not match range %r (%d)"
                % (f_name, f_length, f_range, range_length)
            )

        # split() without an argument ignores leading, trailing and repeated
        # whitespace, so a trailing "(optional)" marker is still the last
        # part and no empty parts end up in the generated name.
        name_parts = f_name.split()

        if name_parts and name_parts[-1].lower() == '(optional)':
            name_parts = name_parts[:-1]
            required = False
        elif re.search('required', f_desc, re.IGNORECASE):
            required = True
        else:
            required = None

        f_name = u'_'.join(part.lower() for part in name_parts)
        f_name = re.sub(r'[^\w]', '', f_name)

        yield {
            'name': f_name,
            'desc': '(' + f_range + '). ' + f_desc,
            'length': f_length,
            'required': required,
        }
|
||||||
|
|
||||||
|
|
||||||
|
def _guess_field_types(self, entries):
    """Annotate every entry with the field class that matches it best.

    Each ``(class, criteria)`` pair in ``self.FIELD_TYPES`` is scored
    against the entry.  A pair whose ``'length'`` criterion is rejected by
    ``LengthExpression`` is skipped; otherwise it earns one point for each
    pattern under ``criteria['regexp'][key]`` that is found in
    ``entry[key]``.  The highest scoring class is stored in
    ``entry['guessed_type']``; when nothing scored at all,
    ``self.fields.TextField`` is stored instead.  Entries are yielded in
    input order.
    """
    length_matches = LengthExpression()

    for entry in entries:
        scores = dict((field_class, 0) for field_class, _ in self.FIELD_TYPES)

        for field_class, criteria in self.FIELD_TYPES:
            if 'length' in criteria and not length_matches(
                    int(entry['length']), criteria['length']):
                continue

            if 'regexp' not in criteria:
                continue

            for key, patterns in criteria['regexp'].items():
                for pattern in patterns:
                    if pattern.search(entry[key]):
                        scores[field_class] += 1

        # Stable ascending sort: among equal scores the class that comes
        # last in the dict's iteration order ends up at the tail.
        ranked = sorted(scores.items(), key=lambda pair: pair[1])

        if sum(score for _, score in ranked) > 0:
            entry['guessed_type'] = ranked[-1][0]
        else:
            entry['guessed_type'] = self.fields.TextField

        yield entry
|
||||||
|
|
||||||
|
def _convert_to_records(self, entries):
|
||||||
|
blank_count = 1
|
||||||
|
for entry in entries:
|
||||||
|
result = []
|
||||||
|
add = result.append
|
||||||
|
|
||||||
|
# FIELD NAME
|
||||||
|
if entry['name'] == 'blank':
|
||||||
|
add( (u'blank%d' % blank_count).ljust(40) )
|
||||||
|
blank_count += 1
|
||||||
|
else:
|
||||||
|
add(entry['name'].ljust(40))
|
||||||
|
|
||||||
|
add(' = ')
|
||||||
|
|
||||||
|
if entry['guessed_type']:
|
||||||
|
add(entry['guessed_type'].__name__)
|
||||||
|
|
||||||
|
args = []
|
||||||
|
args.append("max_length=%d" % entry['length'])
|
||||||
|
if entry['required'] != None:
|
||||||
|
args.append("required=%s" % ('True' if entry['required'] else 'False'))
|
||||||
|
|
||||||
|
add("(" + ", ".join(args) + ")")
|
||||||
|
|
||||||
|
|
||||||
|
yield "".join(result)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class PastedDefParser(RecordBuilder):
|
||||||
|
|
||||||
def load(self, infile):
|
def load(self, infile):
|
||||||
tokens = self._tokenize(infile)
|
tokens = self._tokenize(infile)
|
||||||
entries = self._parse(tokens)
|
entries = self._parse(tokens)
|
||||||
|
@ -278,6 +375,7 @@ class PastedDefParser(object):
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
||||||
|
"""
|
||||||
def _guess_field_types(self, entries):
|
def _guess_field_types(self, entries):
|
||||||
lengthexp = LengthExpression()
|
lengthexp = LengthExpression()
|
||||||
|
|
||||||
|
@ -303,7 +401,8 @@ class PastedDefParser(object):
|
||||||
entry['guessed_type'] = matches[-1][0] if matches_found else self.fields.TextField
|
entry['guessed_type'] = matches[-1][0] if matches_found else self.fields.TextField
|
||||||
yield entry
|
yield entry
|
||||||
|
|
||||||
|
"""
|
||||||
|
"""
|
||||||
def _convert_to_records(self, entries):
|
def _convert_to_records(self, entries):
|
||||||
blank_count = 1
|
blank_count = 1
|
||||||
for entry in entries:
|
for entry in entries:
|
||||||
|
@ -332,4 +431,4 @@ class PastedDefParser(object):
|
||||||
|
|
||||||
yield "".join(result)
|
yield "".join(result)
|
||||||
|
|
||||||
|
"""
|
||||||
|
|
|
@ -18,35 +18,12 @@ class PDFRecordFinder(object):
|
||||||
self.textrows = pdftext.split('\n')
|
self.textrows = pdftext.split('\n')
|
||||||
self.heading_exp = heading_exp
|
self.heading_exp = heading_exp
|
||||||
|
|
||||||
"""
|
|
||||||
def columns(self):
|
|
||||||
results = []
|
|
||||||
cc = ColumnCollector()
|
|
||||||
for heading, group in self.record_grouping():
|
|
||||||
print "HEADING", heading
|
|
||||||
for row in group:
|
|
||||||
if cc.is_next_field(row):
|
|
||||||
yield cc
|
|
||||||
cc = ColumnCollector()
|
|
||||||
#print row
|
|
||||||
try:
|
|
||||||
cc.add(row)
|
|
||||||
except UnknownColumn, e:
|
|
||||||
results.append(cc)
|
|
||||||
cc = ColumnCollector()
|
|
||||||
|
|
||||||
|
def records(self):
|
||||||
|
headings = self.locate_heading_rows()
|
||||||
def record_grouping(self):
|
for (start, end, name) in headings:
|
||||||
row_iter = iter(self.textrows)
|
name = name.decode('ascii', 'ignore')
|
||||||
i = 0
|
yield (name, list(self.find_fields(iter(self.textrows[start+1:end]))))
|
||||||
for row in row_iter:
|
|
||||||
i += 1
|
|
||||||
match = self.heading_exp.match(row)
|
|
||||||
if match:
|
|
||||||
yield (match.groups(), self.extract_record_columns(row_iter))
|
|
||||||
|
|
||||||
"""
|
|
||||||
|
|
||||||
|
|
||||||
def locate_heading_rows(self):
|
def locate_heading_rows(self):
|
||||||
|
@ -56,45 +33,32 @@ class PDFRecordFinder(object):
|
||||||
if match:
|
if match:
|
||||||
if not ''.join(match.groups()).lower().endswith('(continued)'):
|
if not ''.join(match.groups()).lower().endswith('(continued)'):
|
||||||
results.append((i, ''.join(match.groups())))
|
results.append((i, ''.join(match.groups())))
|
||||||
return results
|
|
||||||
|
|
||||||
|
merged = []
|
||||||
|
for (a, b) in zip(results, results[1:] + [(len(self.textrows),None)]):
|
||||||
|
merged.append( (a[0], b[0]-1, a[1]) )
|
||||||
|
|
||||||
def records2(self):
|
return merged
|
||||||
row_iter = iter(self.textrows)
|
|
||||||
record = True
|
|
||||||
while record:
|
|
||||||
record = self.extract_record(row_iter)
|
|
||||||
yield record
|
|
||||||
|
|
||||||
|
|
||||||
def extract_record(self, row_iter):
|
|
||||||
heading = self.find_heading(row_iter)
|
|
||||||
fields = self.find_fields(row_iter)
|
|
||||||
return heading, list(fields)
|
|
||||||
|
|
||||||
|
|
||||||
def find_heading(self, row_iter):
|
|
||||||
for row in row_iter:
|
|
||||||
heading_match = self.heading_exp.match(row)
|
|
||||||
if heading_match:
|
|
||||||
return heading_match.groups()
|
|
||||||
|
|
||||||
|
|
||||||
def find_fields(self, row_iter):
|
def find_fields(self, row_iter):
|
||||||
cc = ColumnCollector()
|
cc = ColumnCollector()
|
||||||
for r in row_iter:
|
for r in row_iter:
|
||||||
row = self.extract_columns_from_row(r)
|
row = self.extract_columns_from_row(r.decode('UTF-8'))
|
||||||
|
|
||||||
if not row:
|
if not row:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if cc.is_next_field(row):
|
if cc.is_next_field(row):
|
||||||
yield cc
|
yield cc
|
||||||
cc = ColumnCollector()
|
cc = ColumnCollector()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
cc.add(row)
|
cc.add(row)
|
||||||
|
|
||||||
except UnknownColumn, e:
|
except UnknownColumn, e:
|
||||||
print 'UNKNOWN COLUMN', row
|
|
||||||
raise StopIteration
|
raise StopIteration
|
||||||
|
yield cc
|
||||||
|
|
||||||
|
|
||||||
def extract_columns_from_row(self, row):
|
def extract_columns_from_row(self, row):
|
||||||
|
@ -124,7 +88,7 @@ class PDFRecordFinder(object):
|
||||||
end = white_iter.next()
|
end = white_iter.next()
|
||||||
if start != end:
|
if start != end:
|
||||||
row_result.append(
|
row_result.append(
|
||||||
(start, row[start:end])
|
(start, row[start:end].encode('ascii','ignore'))
|
||||||
)
|
)
|
||||||
|
|
||||||
except StopIteration:
|
except StopIteration:
|
||||||
|
@ -133,78 +97,6 @@ class PDFRecordFinder(object):
|
||||||
return row_result
|
return row_result
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def extract_row_columns(self, row_iter):
|
|
||||||
re_multiwhite = re.compile(r'\s{2,}')
|
|
||||||
full_width_text_count = 0
|
|
||||||
|
|
||||||
#for r in row_iter:
|
|
||||||
row = None
|
|
||||||
while not row:
|
|
||||||
row = row_iter.next()
|
|
||||||
row = row.decode('UTF-8')
|
|
||||||
|
|
||||||
# IF LINE CONTAINS MULTIPLE WHITESPACE, IT IS PROBABLY A TABLE
|
|
||||||
if not re_multiwhite.search(row):
|
|
||||||
full_width_text_count += 1
|
|
||||||
row = None
|
|
||||||
|
|
||||||
if True:
|
|
||||||
white_ranges = [0,]
|
|
||||||
pos = 0
|
|
||||||
match = True
|
|
||||||
while pos < len(row):
|
|
||||||
match = re_multiwhite.search(row[pos:])
|
|
||||||
if match:
|
|
||||||
white_ranges.append(pos + match.start())
|
|
||||||
white_ranges.append(pos + match.end())
|
|
||||||
pos += match.end()
|
|
||||||
else:
|
|
||||||
white_ranges.append(len(row))
|
|
||||||
pos = len(row)
|
|
||||||
|
|
||||||
|
|
||||||
row_result = []
|
|
||||||
white_iter = iter(white_ranges)
|
|
||||||
while white_iter:
|
|
||||||
try:
|
|
||||||
start = white_iter.next()
|
|
||||||
end = white_iter.next()
|
|
||||||
if start != end:
|
|
||||||
row_result.append(
|
|
||||||
(start, row[start:end])
|
|
||||||
)
|
|
||||||
|
|
||||||
except StopIteration:
|
|
||||||
white_iter = None
|
|
||||||
|
|
||||||
#print white_ranges
|
|
||||||
return row_result
|
|
||||||
#yield row_result
|
|
||||||
#result.append(row_result)
|
|
||||||
|
|
||||||
|
|
||||||
"""
|
|
||||||
row_result = []
|
|
||||||
pos = 0
|
|
||||||
while pos < len(row):
|
|
||||||
match = re_multiwhite.search(row[pos:])
|
|
||||||
if match and match.start() > 0:
|
|
||||||
row_result.append((
|
|
||||||
pos,
|
|
||||||
row[pos:pos+match.start()],))
|
|
||||||
pos += match.end()
|
|
||||||
else:
|
|
||||||
if match:
|
|
||||||
pos += match.end()
|
|
||||||
row_result.append((pos,row[pos:]))
|
|
||||||
pos += len(row)
|
|
||||||
result.append(row_result)
|
|
||||||
row_result = []
|
|
||||||
"""
|
|
||||||
#return result
|
|
||||||
|
|
||||||
|
|
||||||
class UnknownColumn(Exception):
|
class UnknownColumn(Exception):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@ -246,6 +138,8 @@ class ColumnCollector(object):
|
||||||
return self.data.keys()[0] == first_key
|
return self.data.keys()[0] == first_key
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
@property
|
||||||
|
def tuple(self):
|
||||||
|
return tuple(map(lambda k:self.data[k], sorted(self.data.keys())))
|
||||||
|
|
||||||
|
|
||||||
|
|
47
scripts/pyaccuwage-pdfparse
Executable file
47
scripts/pyaccuwage-pdfparse
Executable file
|
@ -0,0 +1,47 @@
|
||||||
|
#!/usr/bin/python
|
||||||
|
from pyaccuwage.parser import RecordBuilder
|
||||||
|
from pyaccuwage.pdfexport import PDFRecordFinder
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# Command-line interface of the script.
parser = argparse.ArgumentParser(description="Parse and convert contents of IRS files into pyaccuwage e-file classes.")
parser.add_argument("-f", "--full", help="Generate full python file, including related imports.", action="store_true")
parser.add_argument("-i", "--input", metavar="file", type=argparse.FileType('r'), help="Source PDF file, ie: p1220.pdf")
# The output section further down reads args.classname, so the option has to
# be declared; it defaults to None so the "GeneratedRecord" fallback applies.
parser.add_argument("-c", "--classname", metavar="name", default=None, help="Name of the generated record class (default: GeneratedRecord).")

args = parser.parse_args()
|
||||||
|
|
||||||
|
"""
|
||||||
|
lines = []
|
||||||
|
for x in sys.stdin.readlines():
|
||||||
|
lines.append(x)
|
||||||
|
|
||||||
|
pdp = PastedDefParser()
|
||||||
|
tokens = pdp.load("".join(lines))
|
||||||
|
"""
|
||||||
|
def generate_imports():
    """Return the import header written at the top of a full generated file.

    The text consists of the two import statements followed by one blank
    line.
    """
    header_lines = (
        "from pyaccuwage import model",
        "from pyaccuwage.fields import *",
        "",
        "",
    )
    return "\n".join(header_lines)
|
||||||
|
|
||||||
|
def generate_class_begin(name):
    """Return the ``class`` statement line that opens a generated record.

    The base class is written as ``model.Model`` so that it resolves
    against the ``from pyaccuwage import model`` line produced by
    ``generate_imports``.
    """
    return "class %s(model.Model):\n" % name
|
||||||
|
|
||||||
|
if args.full:
|
||||||
|
sys.stdout.write(generate_imports())
|
||||||
|
|
||||||
|
if args.classname:
|
||||||
|
classname = args.classname
|
||||||
|
else:
|
||||||
|
classname = "GeneratedRecord"
|
||||||
|
|
||||||
|
sys.stdout.write(generate_class_begin(classname))
|
||||||
|
|
||||||
|
|
||||||
|
for x in tokens:
|
||||||
|
sys.stdout.write('\t' + x + '\n')
|
||||||
|
|
||||||
|
|
4
setup.py
4
setup.py
|
@ -1,7 +1,7 @@
|
||||||
from distutils.core import setup
|
from distutils.core import setup
|
||||||
setup(name='pyaccuwage',
|
setup(name='pyaccuwage',
|
||||||
version='0.0.6',
|
version='0.0.7',
|
||||||
packages=['pyaccuwage'],
|
packages=['pyaccuwage'],
|
||||||
scripts=['scripts/pyaccuwage-parse'],
|
scripts=['scripts/pyaccuwage-parse', 'scripts/pyaccuwage-pdfparse'],
|
||||||
zip_safe=True,
|
zip_safe=True,
|
||||||
)
|
)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue