Fixed issue with the last item not being inserted into tokens. Now able to convert PDF text into record field definitions pretty reliably. Need to add additional field type detection rules.

This commit is contained in:
Binh 2012-04-18 14:51:59 -05:00
parent 027b44b65c
commit 2c9551f677

View file

@ -2,8 +2,6 @@
# coding=UTF-8 # coding=UTF-8
import re import re
class SimpleDefParser(object): class SimpleDefParser(object):
def __init__(self): def __init__(self):
pass pass
@ -78,16 +76,34 @@ class NumericToken(BaseToken):
class PastedDefParser(object): class PastedDefParser(object):
import fields
TOKEN_TYPES = [ TOKEN_TYPES = [
RangeToken, RangeToken,
NumericToken, NumericToken,
StringToken, StringToken,
] ]
FIELD_TYPES = [
(fields.BlankField, {'name': [
re.compile(r'^blank$'),
]}),
(fields.MoneyField, {'desc': [
re.compile(r'right\-justified'),
re.compile(r'amount'),
re.compile(r'zero\-filled'),
]}),
(fields.TextField, {'desc': [
re.compile(r'enter blanks')
]})
]
def load(self, infile): def load(self, infile):
tokens = self._tokenize(infile) tokens = self._tokenize(infile)
entries = self._parse(tokens) entries = self._parse(tokens)
#entries = self._compile(entries) entries = self._compile(entries)
entries = self._guess_field_types(entries)
entries = self._convert_to_records(entries)
return entries return entries
def _tokenize(self, data): def _tokenize(self, data):
@ -100,6 +116,8 @@ class PastedDefParser(object):
yield tclass(item) yield tclass(item)
break break
yield None
def _parse(self, tokens): def _parse(self, tokens):
# TODO group things based on strides between RangeTokens, probably # TODO group things based on strides between RangeTokens, probably
# starting with range token, then the following BaseTokens are likely # starting with range token, then the following BaseTokens are likely
@ -116,17 +134,6 @@ class PastedDefParser(object):
# COLLECT TOKENS INTO GROUPS # COLLECT TOKENS INTO GROUPS
for token in tokens: for token in tokens:
"""
if byte_pos == None:
# START. IF byte_pos IS NONE, WE ASSUME WE'RE STARTING WITH
# A NEW SERIES OF FIELDS. THE FIRST VALUE WHICH IS A RANGE
# OR A NUMERIC FIELD IS ASSUMED TO BE THE FIELD POSITION/RANGE.
if isinstance(token, RangeToken):
byte_pos = token.end_position + 1
elif isinstance(token, NumericToken):
byte_pos = token.value + 1
"""
if isinstance(token, NumericToken): if isinstance(token, NumericToken):
# THIS MAY BE A RANGE, IF THE VALUE MATCHES THE CURRENT byte_pos # THIS MAY BE A RANGE, IF THE VALUE MATCHES THE CURRENT byte_pos
# THEN WE ASSUME THIS TO BE A SINGLE BYTE RANGE DEFINITION, # THEN WE ASSUME THIS TO BE A SINGLE BYTE RANGE DEFINITION,
@ -139,7 +146,7 @@ class PastedDefParser(object):
token = RangeToken("%d-%d" % (token.value, token.value)) token = RangeToken("%d-%d" % (token.value, token.value))
if isinstance(token, RangeToken): if isinstance(token, RangeToken) or token == None:
if current_range: if current_range:
groups.append({ groups.append({
'byterange': current_range, 'byterange': current_range,
@ -149,7 +156,8 @@ class PastedDefParser(object):
}) })
# UPDATE RANGE POSITION # UPDATE RANGE POSITION
byte_pos = token.end_position + 1 if token:
byte_pos = token.end_position + 1
current_range = token current_range = token
current_name = [] current_name = []
@ -173,17 +181,75 @@ class PastedDefParser(object):
def _compile(self, groups): def _compile(self, groups):
for g in groups: for g in groups:
assert g['byterange'].value == g['length'].value assert g['byterange'].value == g['length'].value
desc = u' '.join(map(lambda x:unicode(x.value), g['desc']))
if g['name'][-1].value.lower() == '(optional)':
g['name'] = g['name'][0:-1]
required = False
elif re.search('required', desc, re.IGNORECASE):
required = True
else:
required = None
name = u'_'.join(map(lambda x:x.value.lower(), g['name'])) name = u'_'.join(map(lambda x:x.value.lower(), g['name']))
name = re.sub(r'[^\w]','', name) name = re.sub(r'[^\w]','', name)
yield({ yield({
'name': name, 'name': name,
'desc': u' '.join(map(lambda x:unicode(x.value), g['desc'])), 'desc': desc,
'length': g['byterange'].value, 'length': g['byterange'].value,
'required': required,
}) })
def _guess_field_types(self, entries):
    """Annotate each entry with the field class its text most resembles.

    ``self.FIELD_TYPES`` is read as a sequence of
    ``(field_class, {entry_key: [compiled_regex, ...]})`` pairs.  A class
    earns one point for every regex that ``search()``-matches the entry
    value stored under the corresponding key, and the class with the most
    points is stored in ``entry['guessed_type']``.

    When no regex matches at all, ``self.fields.TextField`` is used.  When
    several classes tie on a non-zero score, the one declared later in
    ``FIELD_TYPES`` wins.  This is what the previous sort-based selection
    produced on insertion-ordered dicts; making it explicit keeps the
    result stable on interpreters whose dict order is arbitrary.

    Entries are mutated in place and yielded lazily, in input order.
    Raises ``KeyError`` if an entry lacks a key named by a criterion.
    """
    for entry in entries:
        # Keep first-seen declaration order separately so the tie-break
        # never depends on how the interpreter orders dict keys.
        declared = []
        scores = {}
        for classtype, _ in self.FIELD_TYPES:
            if classtype not in scores:
                scores[classtype] = 0
                declared.append(classtype)

        for classtype, criteria in self.FIELD_TYPES:
            for crit_key, patterns in criteria.items():
                scores[classtype] += sum(
                    1 for pattern in patterns
                    if pattern.search(entry[crit_key])
                )

        best_type = None
        best_score = 0
        for classtype in declared:
            score = scores[classtype]
            # ``>=`` lets a later declaration overtake an equal score.
            if score > 0 and score >= best_score:
                best_type = classtype
                best_score = score

        if best_type is None:
            # Looked up only when needed, as before.
            best_type = self.fields.TextField

        entry['guessed_type'] = best_type
        yield entry
def _convert_to_records(self, entries):
    """Render each entry as a ``name = FieldClass(args)`` source line.

    Each entry is a dict providing ``name``, ``guessed_type``, ``length``
    and ``required``.  Entries named exactly ``'blank'`` are renamed
    ``blank1``, ``blank2``, ... in order of appearance so that the
    generated names do not collide.  ``max_length`` is always emitted;
    ``required`` is emitted only when it is not ``None``.

    Yields one string per entry, lazily and in input order.  If
    ``guessed_type`` is falsy the class name is simply omitted from the
    line.
    """
    blank_count = 1
    for entry in entries:
        # FIELD NAME
        if entry['name'] == 'blank':
            field_name = 'blank%d' % blank_count
            blank_count += 1
        else:
            field_name = entry['name']

        guessed_type = entry['guessed_type']
        type_name = guessed_type.__name__ if guessed_type else ''

        args = ["max_length=%d" % entry['length']]
        # Identity test: ``None`` means "unknown", and ``!= None`` could be
        # answered by an overloaded ``__ne__`` instead.
        if entry['required'] is not None:
            args.append(
                "required=%s" % ('True' if entry['required'] else 'False'))

        yield "".join(
            [field_name, ' = ', type_name, "(" + ", ".join(args) + ")"])
sdp = SimpleDefParser() sdp = SimpleDefParser()
tokens = sdp.load([ tokens = sdp.load([
"record type,text,1", "record type,text,1",
@ -278,41 +344,142 @@ Amount 8*
""") """)
tokens3 = pdp.load(""" tokens3 = pdp.load("""
544-546 Blank 3 Enter blanks. 544-546 Blank 3 Enter blanks.
547 Trade or 547 Type of
Business Payment
Indicator Indicator
1 Enter 1 (one) to indicate the state or local income tax refund, 1 Enter the appropriate indicator from the following table;
credit, or offset (Amount Code 2) is attributable to income tax otherwise, enter blanks.
that applies exclusively to income from a trade or business. Indicator Usage
Indicator Usage 1 Per diem
1 Income tax refund applies exclusively to a trade or 2 Reimbursed amount
business. 548-556 Social Security
Blank Income tax refund is a general tax refund. Number of
548-551 Tax Year of Insured
Refund 9 Required. Enter the Social Security Number of the insured.
4 Enter the tax year for which the refund, credit, or 557-596 Name of Insured 40 Required. Enter the name of the insured.
offset (Amount Code 2) was issued. The tax year must reflect 597-636 Address of
the tax year for which the refund was made, not the tax year Insured
of Form 1099-G. The tax year must be in the four-position 40 Required. Enter the address of the insured. The street address
should include number, street, apartment or suite number (or PO
Box if mail is not delivered to street address). Left-justify
information and fill unused positions with blanks. This field
must not contain any data other than the payees address.
637-676 City of Insured 40 Required. Enter the city, town, or post office. Left-justify and
fill unused positions with blanks. Enter APO or FPO, if
applicable. Do not enter state and ZIP Code information in this
field.
677-678 State of Insured 2 Required. Enter the valid U.S. Postal Service state
abbreviations for states or the appropriate postal identifier (AA,
AE, or AP) described in Part A, Sec. 12.
679-687 ZIP Code of
Insured
9 Required. Enter the valid nine-digit ZIP Code assigned by the
U.S. Postal Service. If only the first five-digits are known, leftjustify information and fill the unused positions with blanks.
For foreign countries, alpha characters are acceptable as long as
the filer has entered a 1 (one) in the Foreign Country
Indicator, located in position 247 of the B Record.
688 Status of Illness
Indicator
(Optional)
1 Enter the appropriate code from the table below to indicate the
status of the illness of the insured; otherwise, enter blank.
Indicator Usage
1 Chronically ill
2 Terminally ill
689-696 Date Certified
(Optional)
8 Enter the latest date of a doctor's certification of the status of the
insured's illness. The format of the date is YYYYMMDD (e.g.,
January 5, 2011, would be 20110105). Do not enter hyphens
or slashes.
697 Qualified
Contract
Indicator
(Optional)
1 Enter a 1 (one) if benefits were from a qualified long-term
care insurance contract; otherwise, enter a blank.
698-722 Blank 25 Enter blanks.
723-734 State Income
Tax Withheld
12 State income tax withheld is for the convenience of the filers.
This information does not need to be reported to IRS. Rightjustify information and fill unused positions with zeros.
735-746 Local Income
Tax Withheld
12 Local income tax withheld is for the convenience of the filers.
This information does not need to be reported to IRS. The
payment amount must be right-justify information and fill
unused positions with zeros.
747-748 Blank 2 Enter blanks.
749-750 Blank 2 Enter blanks or carriage return/line feed (CR/LF) characters.
format of YYYY (e.g., 2010). The valid range of years for the
refund is 2001 through 2010.
""") """)