Fixed an issue where the last item was not being inserted into tokens. Now able to convert PDF text into record field definitions fairly reliably. Still need to add additional field type detection rules.
This commit is contained in:
parent
027b44b65c
commit
2c9551f677
1 changed file with 201 additions and 34 deletions
|
@ -2,8 +2,6 @@
|
||||||
# coding=UTF-8
|
# coding=UTF-8
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class SimpleDefParser(object):
|
class SimpleDefParser(object):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
pass
|
pass
|
||||||
|
@ -78,16 +76,34 @@ class NumericToken(BaseToken):
|
||||||
|
|
||||||
|
|
||||||
class PastedDefParser(object):
|
class PastedDefParser(object):
|
||||||
|
import fields
|
||||||
|
|
||||||
TOKEN_TYPES = [
|
TOKEN_TYPES = [
|
||||||
RangeToken,
|
RangeToken,
|
||||||
NumericToken,
|
NumericToken,
|
||||||
StringToken,
|
StringToken,
|
||||||
]
|
]
|
||||||
|
|
||||||
|
FIELD_TYPES = [
|
||||||
|
(fields.BlankField, {'name': [
|
||||||
|
re.compile(r'^blank$'),
|
||||||
|
]}),
|
||||||
|
(fields.MoneyField, {'desc': [
|
||||||
|
re.compile(r'right\-justified'),
|
||||||
|
re.compile(r'amount'),
|
||||||
|
re.compile(r'zero\-filled'),
|
||||||
|
]}),
|
||||||
|
(fields.TextField, {'desc': [
|
||||||
|
re.compile(r'enter blanks')
|
||||||
|
]})
|
||||||
|
]
|
||||||
|
|
||||||
def load(self, infile):
|
def load(self, infile):
|
||||||
tokens = self._tokenize(infile)
|
tokens = self._tokenize(infile)
|
||||||
entries = self._parse(tokens)
|
entries = self._parse(tokens)
|
||||||
#entries = self._compile(entries)
|
entries = self._compile(entries)
|
||||||
|
entries = self._guess_field_types(entries)
|
||||||
|
entries = self._convert_to_records(entries)
|
||||||
return entries
|
return entries
|
||||||
|
|
||||||
def _tokenize(self, data):
|
def _tokenize(self, data):
|
||||||
|
@ -100,6 +116,8 @@ class PastedDefParser(object):
|
||||||
yield tclass(item)
|
yield tclass(item)
|
||||||
break
|
break
|
||||||
|
|
||||||
|
yield None
|
||||||
|
|
||||||
def _parse(self, tokens):
|
def _parse(self, tokens):
|
||||||
# TODO group things based on strides between RangeTokens, probably
|
# TODO group things based on strides between RangeTokens, probably
|
||||||
# starting with range token, then the following BaseTokens are likely
|
# starting with range token, then the following BaseTokens are likely
|
||||||
|
@ -116,17 +134,6 @@ class PastedDefParser(object):
|
||||||
|
|
||||||
# COLLECT TOKENS INTO GROUPS
|
# COLLECT TOKENS INTO GROUPS
|
||||||
for token in tokens:
|
for token in tokens:
|
||||||
"""
|
|
||||||
if byte_pos == None:
|
|
||||||
# START. IF byte_pos IS NONE, WE ASSUME WE'RE STARTING WITH
|
|
||||||
# A NEW SERIES OF FIELDS. THE FIRST VALUE WHICH IS A RANGE
|
|
||||||
# OR A NUMERIC FIELD IS ASSUMED TO BE THE FIELD POSITION/RANGE.
|
|
||||||
|
|
||||||
if isinstance(token, RangeToken):
|
|
||||||
byte_pos = token.end_position + 1
|
|
||||||
elif isinstance(token, NumericToken):
|
|
||||||
byte_pos = token.value + 1
|
|
||||||
"""
|
|
||||||
if isinstance(token, NumericToken):
|
if isinstance(token, NumericToken):
|
||||||
# THIS MAY BE A RANGE, IF THE VALUE MATCHES THE CURRENT byte_pos
|
# THIS MAY BE A RANGE, IF THE VALUE MATCHES THE CURRENT byte_pos
|
||||||
# THEN WE ASSUME THIS TO BE A SINGLE BYTE RANGE DEFINITION,
|
# THEN WE ASSUME THIS TO BE A SINGLE BYTE RANGE DEFINITION,
|
||||||
|
@ -139,7 +146,7 @@ class PastedDefParser(object):
|
||||||
token = RangeToken("%d-%d" % (token.value, token.value))
|
token = RangeToken("%d-%d" % (token.value, token.value))
|
||||||
|
|
||||||
|
|
||||||
if isinstance(token, RangeToken):
|
if isinstance(token, RangeToken) or token == None:
|
||||||
if current_range:
|
if current_range:
|
||||||
groups.append({
|
groups.append({
|
||||||
'byterange': current_range,
|
'byterange': current_range,
|
||||||
|
@ -149,7 +156,8 @@ class PastedDefParser(object):
|
||||||
})
|
})
|
||||||
|
|
||||||
# UPDATE RANGE POSITION
|
# UPDATE RANGE POSITION
|
||||||
byte_pos = token.end_position + 1
|
if token:
|
||||||
|
byte_pos = token.end_position + 1
|
||||||
|
|
||||||
current_range = token
|
current_range = token
|
||||||
current_name = []
|
current_name = []
|
||||||
|
@ -173,17 +181,75 @@ class PastedDefParser(object):
|
||||||
def _compile(self, groups):
|
def _compile(self, groups):
|
||||||
for g in groups:
|
for g in groups:
|
||||||
assert g['byterange'].value == g['length'].value
|
assert g['byterange'].value == g['length'].value
|
||||||
|
|
||||||
|
desc = u' '.join(map(lambda x:unicode(x.value), g['desc']))
|
||||||
|
|
||||||
|
if g['name'][-1].value.lower() == '(optional)':
|
||||||
|
g['name'] = g['name'][0:-1]
|
||||||
|
required = False
|
||||||
|
elif re.search('required', desc, re.IGNORECASE):
|
||||||
|
required = True
|
||||||
|
else:
|
||||||
|
required = None
|
||||||
|
|
||||||
name = u'_'.join(map(lambda x:x.value.lower(), g['name']))
|
name = u'_'.join(map(lambda x:x.value.lower(), g['name']))
|
||||||
name = re.sub(r'[^\w]','', name)
|
name = re.sub(r'[^\w]','', name)
|
||||||
|
|
||||||
yield({
|
yield({
|
||||||
'name': name,
|
'name': name,
|
||||||
'desc': u' '.join(map(lambda x:unicode(x.value), g['desc'])),
|
'desc': desc,
|
||||||
'length': g['byterange'].value,
|
'length': g['byterange'].value,
|
||||||
|
'required': required,
|
||||||
})
|
})
|
||||||
|
|
||||||
|
|
||||||
|
def _guess_field_types(self, entries):
|
||||||
|
for entry in entries:
|
||||||
|
matches = dict(map(lambda x:(x[0],0), self.FIELD_TYPES))
|
||||||
|
|
||||||
|
for (classtype, criteria) in self.FIELD_TYPES:
|
||||||
|
for crit_key, crit_values in criteria.items():
|
||||||
|
for crit_re in crit_values:
|
||||||
|
matches[classtype] += 1 if crit_re.search(entry[crit_key]) else 0
|
||||||
|
|
||||||
|
|
||||||
|
matches = list(matches.items())
|
||||||
|
matches.sort(key=lambda x:x[1])
|
||||||
|
|
||||||
|
matches_found = True if sum(map(lambda x:x[1], matches)) > 0 else False
|
||||||
|
|
||||||
|
entry['guessed_type'] = matches[-1][0] if matches_found else self.fields.TextField
|
||||||
|
yield entry
|
||||||
|
|
||||||
|
|
||||||
|
def _convert_to_records(self, entries):
|
||||||
|
blank_count = 1
|
||||||
|
for entry in entries:
|
||||||
|
result = []
|
||||||
|
add = result.append
|
||||||
|
|
||||||
|
# FIELD NAME
|
||||||
|
if entry['name'] == 'blank':
|
||||||
|
add('blank%d' % blank_count)
|
||||||
|
blank_count += 1
|
||||||
|
else:
|
||||||
|
add(entry['name'])
|
||||||
|
|
||||||
|
add(' = ')
|
||||||
|
|
||||||
|
if entry['guessed_type']:
|
||||||
|
add(entry['guessed_type'].__name__)
|
||||||
|
|
||||||
|
args = []
|
||||||
|
args.append("max_length=%d" % entry['length'])
|
||||||
|
if entry['required'] != None:
|
||||||
|
args.append("required=%s" % ('True' if entry['required'] else 'False'))
|
||||||
|
|
||||||
|
add("(" + ", ".join(args) + ")")
|
||||||
|
|
||||||
|
|
||||||
|
yield "".join(result)
|
||||||
|
|
||||||
sdp = SimpleDefParser()
|
sdp = SimpleDefParser()
|
||||||
tokens = sdp.load([
|
tokens = sdp.load([
|
||||||
"record type,text,1",
|
"record type,text,1",
|
||||||
|
@ -278,41 +344,142 @@ Amount 8*
|
||||||
""")
|
""")
|
||||||
|
|
||||||
tokens3 = pdp.load("""
|
tokens3 = pdp.load("""
|
||||||
|
|
||||||
544-546 Blank 3 Enter blanks.
|
544-546 Blank 3 Enter blanks.
|
||||||
|
|
||||||
547 Trade or
|
547 Type of
|
||||||
|
|
||||||
Business
|
Payment
|
||||||
|
|
||||||
Indicator
|
Indicator
|
||||||
|
|
||||||
1 Enter “1” (one) to indicate the state or local income tax refund,
|
1 Enter the appropriate indicator from the following table;
|
||||||
|
|
||||||
credit, or offset (Amount Code 2) is attributable to income tax
|
otherwise, enter blanks.
|
||||||
|
|
||||||
that applies exclusively to income from a trade or business.
|
Indicator Usage
|
||||||
|
|
||||||
Indicator Usage
|
1 Per diem
|
||||||
|
|
||||||
1 Income tax refund applies exclusively to a trade or
|
2 Reimbursed amount
|
||||||
|
|
||||||
business.
|
548-556 Social Security
|
||||||
|
|
||||||
Blank Income tax refund is a general tax refund.
|
Number of
|
||||||
|
|
||||||
548-551 Tax Year of
|
Insured
|
||||||
|
|
||||||
Refund
|
9 Required. Enter the Social Security Number of the insured.
|
||||||
|
|
||||||
4 Enter the tax year for which the refund, credit, or
|
557-596 Name of Insured 40 Required. Enter the name of the insured.
|
||||||
|
|
||||||
offset (Amount Code 2) was issued. The tax year must reflect
|
597-636 Address of
|
||||||
|
|
||||||
the tax year for which the refund was made, not the tax year
|
Insured
|
||||||
|
|
||||||
of Form 1099-G. The tax year must be in the four-position
|
40 Required. Enter the address of the insured. The street address
|
||||||
|
|
||||||
|
should include number, street, apartment or suite number (or PO
|
||||||
|
|
||||||
|
Box if mail is not delivered to street address). Left-justify
|
||||||
|
|
||||||
|
information and fill unused positions with blanks. This field
|
||||||
|
|
||||||
|
must not contain any data other than the payee’s address.
|
||||||
|
|
||||||
|
637-676 City of Insured 40 Required. Enter the city, town, or post office. Left-justify and
|
||||||
|
|
||||||
|
fill unused positions with blanks. Enter APO or FPO, if
|
||||||
|
|
||||||
|
applicable. Do not enter state and ZIP Code information in this
|
||||||
|
|
||||||
|
field.
|
||||||
|
|
||||||
|
677-678 State of Insured 2 Required. Enter the valid U.S. Postal Service state
|
||||||
|
|
||||||
|
abbreviations for states or the appropriate postal identifier (AA,
|
||||||
|
|
||||||
|
AE, or AP) described in Part A, Sec. 12.
|
||||||
|
|
||||||
|
679-687 ZIP Code of
|
||||||
|
|
||||||
|
Insured
|
||||||
|
|
||||||
|
9 Required. Enter the valid nine-digit ZIP Code assigned by the
|
||||||
|
|
||||||
|
U.S. Postal Service. If only the first five-digits are known, leftjustify information and fill the unused positions with blanks.
|
||||||
|
|
||||||
|
For foreign countries, alpha characters are acceptable as long as
|
||||||
|
|
||||||
|
the filer has entered a “1” (one) in the Foreign Country
|
||||||
|
|
||||||
|
Indicator, located in position 247 of the “B” Record.
|
||||||
|
|
||||||
|
688 Status of Illness
|
||||||
|
|
||||||
|
Indicator
|
||||||
|
|
||||||
|
(Optional)
|
||||||
|
|
||||||
|
1 Enter the appropriate code from the table below to indicate the
|
||||||
|
|
||||||
|
status of the illness of the insured; otherwise, enter blank.
|
||||||
|
|
||||||
|
Indicator Usage
|
||||||
|
|
||||||
|
1 Chronically ill
|
||||||
|
|
||||||
|
2 Terminally ill
|
||||||
|
|
||||||
|
689-696 Date Certified
|
||||||
|
|
||||||
|
(Optional)
|
||||||
|
|
||||||
|
8 Enter the latest date of a doctor's certification of the status of the
|
||||||
|
|
||||||
|
insured's illness. The format of the date is YYYYMMDD (e.g.,
|
||||||
|
|
||||||
|
January 5, 2011, would be 20110105). Do not enter hyphens
|
||||||
|
|
||||||
|
or slashes.
|
||||||
|
|
||||||
|
697 Qualified
|
||||||
|
|
||||||
|
Contract
|
||||||
|
|
||||||
|
Indicator
|
||||||
|
|
||||||
|
(Optional)
|
||||||
|
|
||||||
|
1 Enter a “1” (one) if benefits were from a qualified long-term
|
||||||
|
|
||||||
|
care insurance contract; otherwise, enter a blank.
|
||||||
|
|
||||||
|
698-722 Blank 25 Enter blanks.
|
||||||
|
|
||||||
|
723-734 State Income
|
||||||
|
|
||||||
|
Tax Withheld
|
||||||
|
|
||||||
|
12 State income tax withheld is for the convenience of the filers.
|
||||||
|
|
||||||
|
This information does not need to be reported to IRS. Rightjustify information and fill unused positions with zeros.
|
||||||
|
|
||||||
|
735-746 Local Income
|
||||||
|
|
||||||
|
Tax Withheld
|
||||||
|
|
||||||
|
12 Local income tax withheld is for the convenience of the filers.
|
||||||
|
|
||||||
|
This information does not need to be reported to IRS. The
|
||||||
|
|
||||||
|
payment amount must be right-justify information and fill
|
||||||
|
|
||||||
|
unused positions with zeros.
|
||||||
|
|
||||||
|
747-748 Blank 2 Enter blanks.
|
||||||
|
|
||||||
|
749-750 Blank 2 Enter blanks or carriage return/line feed (CR/LF) characters.
|
||||||
|
|
||||||
format of YYYY (e.g., 2010). The valid range of years for the
|
|
||||||
|
|
||||||
refund is 2001 through 2010.
|
|
||||||
""")
|
""")
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue