From 2c9551f677b5968d36b1a04089100795b7b33a9c Mon Sep 17 00:00:00 2001 From: Binh Nguyen Date: Wed, 18 Apr 2012 14:51:59 -0500 Subject: [PATCH] Fixed issue with last item not being inserted into tokens. Now able to convert PDF text into record field definitions pretty reliably. Need to add additional field type detection rules. --- pyaccuwage/parser.py | 235 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 201 insertions(+), 34 deletions(-) diff --git a/pyaccuwage/parser.py b/pyaccuwage/parser.py index b103ee8..1e99f0e 100644 --- a/pyaccuwage/parser.py +++ b/pyaccuwage/parser.py @@ -2,8 +2,6 @@ # coding=UTF-8 import re - - class SimpleDefParser(object): def __init__(self): pass @@ -78,16 +76,34 @@ class NumericToken(BaseToken): class PastedDefParser(object): + import fields + TOKEN_TYPES = [ RangeToken, NumericToken, StringToken, ] + FIELD_TYPES = [ + (fields.BlankField, {'name': [ + re.compile(r'^blank$'), + ]}), + (fields.MoneyField, {'desc': [ + re.compile(r'right\-justified'), + re.compile(r'amount'), + re.compile(r'zero\-filled'), + ]}), + (fields.TextField, {'desc': [ + re.compile(r'enter blanks') + ]}) + ] + def load(self, infile): tokens = self._tokenize(infile) entries = self._parse(tokens) - #entries = self._compile(entries) + entries = self._compile(entries) + entries = self._guess_field_types(entries) + entries = self._convert_to_records(entries) return entries def _tokenize(self, data): @@ -100,6 +116,8 @@ class PastedDefParser(object): yield tclass(item) break + yield None + def _parse(self, tokens): # TODO group things based on strides between RangeTokens, probably # starting with range token, then the following BaseTokens are likely @@ -116,17 +134,6 @@ class PastedDefParser(object): # COLLECT TOKENS INTO GROUPS for token in tokens: - """ - if byte_pos == None: - # START. IF byte_pos IS NONE, WE ASSUME WE'RE STARTING WITH - # A NEW SERIES OF FIELDS. 
THE FIRST VALUE WHICH IS A RANGE - # OR A NUMERIC FIELD IS ASSUMED TO BE THE FIELD POSITION/RANGE. - - if isinstance(token, RangeToken): - byte_pos = token.end_position + 1 - elif isinstance(token, NumericToken): - byte_pos = token.value + 1 - """ if isinstance(token, NumericToken): # THIS MAY BE A RANGE, IF THE VALUE MATCHES THE CURRENT byte_pos # THEN WE ASSUME THIS TO BE A SINGLE BYTE RANGE DEFINITION, @@ -139,7 +146,7 @@ class PastedDefParser(object): token = RangeToken("%d-%d" % (token.value, token.value)) - if isinstance(token, RangeToken): + if isinstance(token, RangeToken) or token == None: if current_range: groups.append({ 'byterange': current_range, @@ -149,7 +156,8 @@ class PastedDefParser(object): }) # UPDATE RANGE POSITION - byte_pos = token.end_position + 1 + if token: + byte_pos = token.end_position + 1 current_range = token current_name = [] @@ -173,17 +181,75 @@ class PastedDefParser(object): def _compile(self, groups): for g in groups: assert g['byterange'].value == g['length'].value + + desc = u' '.join(map(lambda x:unicode(x.value), g['desc'])) + if g['name'][-1].value.lower() == '(optional)': + g['name'] = g['name'][0:-1] + required = False + elif re.search('required', desc, re.IGNORECASE): + required = True + else: + required = None + name = u'_'.join(map(lambda x:x.value.lower(), g['name'])) name = re.sub(r'[^\w]','', name) yield({ 'name': name, - 'desc': u' '.join(map(lambda x:unicode(x.value), g['desc'])), + 'desc': desc, 'length': g['byterange'].value, + 'required': required, }) + def _guess_field_types(self, entries): + for entry in entries: + matches = dict(map(lambda x:(x[0],0), self.FIELD_TYPES)) + + for (classtype, criteria) in self.FIELD_TYPES: + for crit_key, crit_values in criteria.items(): + for crit_re in crit_values: + matches[classtype] += 1 if crit_re.search(entry[crit_key]) else 0 + + + matches = list(matches.items()) + matches.sort(key=lambda x:x[1]) + + matches_found = True if sum(map(lambda x:x[1], matches)) > 0 else False 
+ + entry['guessed_type'] = matches[-1][0] if matches_found else self.fields.TextField + yield entry + + + def _convert_to_records(self, entries): + blank_count = 1 + for entry in entries: + result = [] + add = result.append + + # FIELD NAME + if entry['name'] == 'blank': + add('blank%d' % blank_count) + blank_count += 1 + else: + add(entry['name']) + + add(' = ') + + if entry['guessed_type']: + add(entry['guessed_type'].__name__) + + args = [] + args.append("max_length=%d" % entry['length']) + if entry['required'] != None: + args.append("required=%s" % ('True' if entry['required'] else 'False')) + + add("(" + ", ".join(args) + ")") + + + yield "".join(result) + sdp = SimpleDefParser() tokens = sdp.load([ "record type,text,1", @@ -278,41 +344,142 @@ Amount 8* """) tokens3 = pdp.load(""" + 544-546 Blank 3 Enter blanks. -547 Trade or +547 Type of -Business +Payment Indicator -1 Enter “1” (one) to indicate the state or local income tax refund, +1 Enter the appropriate indicator from the following table; -credit, or offset (Amount Code 2) is attributable to income tax +otherwise, enter blanks. -that applies exclusively to income from a trade or business. +Indicator Usage -Indicator Usage +1 Per diem -1 Income tax refund applies exclusively to a trade or +2 Reimbursed amount -business. +548-556 Social Security -Blank Income tax refund is a general tax refund. +Number of -548-551 Tax Year of +Insured -Refund +9 Required. Enter the Social Security Number of the insured. -4 Enter the tax year for which the refund, credit, or +557-596 Name of Insured 40 Required. Enter the name of the insured. -offset (Amount Code 2) was issued. The tax year must reflect +597-636 Address of -the tax year for which the refund was made, not the tax year +Insured -of Form 1099-G. The tax year must be in the four-position +40 Required. Enter the address of the insured. 
The street address + +should include number, street, apartment or suite number (or PO + +Box if mail is not delivered to street address). Left-justify + +information and fill unused positions with blanks. This field + +must not contain any data other than the payee’s address. + +637-676 City of Insured 40 Required. Enter the city, town, or post office. Left-justify and + +fill unused positions with blanks. Enter APO or FPO, if + +applicable. Do not enter state and ZIP Code information in this + +field. + +677-678 State of Insured 2 Required. Enter the valid U.S. Postal Service state + +abbreviations for states or the appropriate postal identifier (AA, + +AE, or AP) described in Part A, Sec. 12. + +679-687 ZIP Code of + +Insured + +9 Required. Enter the valid nine-digit ZIP Code assigned by the + +U.S. Postal Service. If only the first five-digits are known, leftjustify information and fill the unused positions with blanks. + +For foreign countries, alpha characters are acceptable as long as + +the filer has entered a “1” (one) in the Foreign Country + +Indicator, located in position 247 of the “B” Record. + +688 Status of Illness + +Indicator + +(Optional) + +1 Enter the appropriate code from the table below to indicate the + +status of the illness of the insured; otherwise, enter blank. + +Indicator Usage + +1 Chronically ill + +2 Terminally ill + +689-696 Date Certified + +(Optional) + +8 Enter the latest date of a doctor's certification of the status of the + +insured's illness. The format of the date is YYYYMMDD (e.g., + +January 5, 2011, would be 20110105). Do not enter hyphens + +or slashes. + +697 Qualified + +Contract + +Indicator + +(Optional) + +1 Enter a “1” (one) if benefits were from a qualified long-term + +care insurance contract; otherwise, enter a blank. + +698-722 Blank 25 Enter blanks. + +723-734 State Income + +Tax Withheld + +12 State income tax withheld is for the convenience of the filers. 
+ +This information does not need to be reported to IRS. Rightjustify information and fill unused positions with zeros. + +735-746 Local Income + +Tax Withheld + +12 Local income tax withheld is for the convenience of the filers. + +This information does not need to be reported to IRS. The + +payment amount must be right-justify information and fill + +unused positions with zeros. + +747-748 Blank 2 Enter blanks. + +749-750 Blank 2 Enter blanks or carriage return/line feed (CR/LF) characters. -format of YYYY (e.g., 2010). The valid range of years for the -refund is 2001 through 2010. """)