Parser is mostly working, there's an issue with the last grouping of tokens

not being parsed. This can probably be fixed by yielding an end-marker from the tokenizer generator so the compiler knows to clear out the last item.
2012-04-13 14:39:02 -05:00 · 2012-04-13 14:39:02 -05:00 · 027b44b65c
commit 027b44b65c
parent 6e9b8041b9
1 changed files with 196 additions and 7 deletions
--- a/pyaccuwage/parser.py
+++ b/pyaccuwage/parser.py
@ -41,31 +41,53 @@ class BaseToken(object):
    regexp = re.compile('(.*)')
    def __init__(self, value):
-        self.value = value
+        self._value = value
    def match(self, value):
        """Return the compiled pattern associated with this token type.

        ``value`` is accepted but not inspected here; the pattern object
        stored in ``regexp`` is handed back unchanged.
        """
        # NOTE(review): ``value`` is unused -- presumably the caller applies
        # the returned pattern to it; confirm against the tokenizer.
        pattern = self.regexp
        return pattern
    def __repr__(self):
-        return ",".join([str(self.__class__), self.value])
+        return ",".join([str(self.__class__), self._value])
 class StringToken(BaseToken):
    """Catch-all token whose value is the stored text decoded as ASCII."""

    # The pattern accepts any text, so this type matches whatever it is given.
    regexp = re.compile('(.*)')

    @property
    def value(self):
        """Return the raw text decoded as ASCII, dropping undecodable bytes.

        Assumes ``_value`` is a byte string (a Python 2 ``str``) -- TODO
        confirm against the tokenizer.
        """
        raw = self._value
        return raw.decode('ascii', 'ignore')
 class RangeToken(BaseToken):
    """Token for an inclusive ``start-end`` position range, e.g. ``545-546``."""

    # Raw string so the digit class reaches the regex engine as written
    # instead of relying on an unrecognised string escape.
    regexp = re.compile(r'(\d+)-(\d+)')

    @property
    def value(self):
        """Return the number of positions covered, counting both endpoints.

        Computed as ``end - start + 1`` from the same two parts that
        ``end_position`` reads, so both properties always agree. Plain
        arithmetic is used instead of ``reduce``, which is only a builtin
        on Python 2.
        """
        start = int(self._value.split('-')[0])
        return self.end_position - start + 1

    @property
    def end_position(self):
        """Return the last position of the range (the number after ``-``)."""
        return int(self._value.split('-')[1])
 class NumericToken(BaseToken):
-    regexp = re.compile('(\d+)')
+    regexp = re.compile('^(\d+)$')
    @property
    def value(self):
        return int(self._value)
 class PastedDefParser(object):
    TOKEN_TYPES = [
        RangeToken,
        NumericToken,  
-        BaseToken,
+        StringToken,
    ]
    def load(self, infile):
        """Tokenize ``infile`` and return the parsed result.

        The input is handed to ``_tokenize`` and the resulting tokens to
        ``_parse``; whatever ``_parse`` produces is returned as-is.
        """
        # NOTE: the ``_compile`` pass is not applied here; callers receive
        # the output of ``_parse`` directly.
        return self._parse(self._tokenize(infile))
    def _tokenize(self, data):
@ -82,10 +104,85 @@ class PastedDefParser(object):
        # TODO group things based on strides between RangeTokens, probably
        # starting with range token, then the following BaseTokens are likely
        # the field name, followed by a NumericToken, then Base/Numeric tokens
-        # for the field's description, until then ext RangeToken is found.
+        # for the field's description, until the next RangeToken is found.
-        results = tokens
+        groups = []
        current_range = None
        current_name = []
        current_length = None
        current_desc = []
        state = 'range'
        byte_pos = None
        # COLLECT TOKENS INTO GROUPS
        for token in tokens:
            """
            if byte_pos == None:
                # START. IF byte_pos IS NONE, WE ASSUME WE'RE STARTING WITH
                # A NEW SERIES OF FIELDS. THE FIRST VALUE WHICH IS A RANGE
                # OR A NUMERIC FIELD IS ASSUMED TO BE THE FIELD POSITION/RANGE.
                if isinstance(token, RangeToken):
                    byte_pos = token.end_position + 1
                elif isinstance(token, NumericToken):
                    byte_pos = token.value + 1
            """
            if isinstance(token, NumericToken):
                # THIS MAY BE A RANGE, IF THE VALUE MATCHES THE CURRENT byte_pos
                # THEN WE ASSUME THIS TO BE A SINGLE BYTE RANGE DEFINITION,
                # AND WE WILL TREAT IT AS A NEW RECORD.
                if byte_pos == None or token.value == byte_pos:
                    # UPDATE RANGE POSITION
                    byte_pos = token.value + 1
                    # CONVERT TOKEN INTO RangeToken
                    token = RangeToken("%d-%d" % (token.value, token.value))
            if isinstance(token, RangeToken):
                if current_range:
                    groups.append({
                        'byterange': current_range,
                        'name': current_name,
                        'length': current_length,
                        'desc': current_desc,
                    })
                # UPDATE RANGE POSITION
                byte_pos = token.end_position + 1
                current_range = token
                current_name = []
                current_length = None
                current_desc = []
                state = 'name'
            elif state == 'name':
                if isinstance(token, StringToken) and current_name and isinstance(current_name[-1], NumericToken):
                    current_length = current_name.pop()
                    state = 'desc'
                else:
                    current_name.append(token)
            if state == 'desc':
                current_desc.append(token)
        results = groups
        return results
    def _compile(self, groups):
        for g in groups:
            assert g['byterange'].value == g['length'].value
            name = u'_'.join(map(lambda x:x.value.lower(), g['name']))
            name = re.sub(r'[^\w]','', name)
            yield({
                'name': name,
                'desc': u' '.join(map(lambda x:unicode(x.value), g['desc'])),
                'length': g['byterange'].value,
            })
 sdp = SimpleDefParser()
 tokens = sdp.load([
@ -97,6 +194,58 @@ tokens = sdp.load([
 pdp = PastedDefParser()
 tokens2 = pdp.load("""
 544 Second TIN 
 Notice 
 (Optional) 
 1 Enter “2” (two) to indicate notification by IRS twice within 
 three calendar years that the payee provided an incorrect name 
 and/or TIN combination; otherwise, enter a blank. 
 545-546 Blank 2 Enter blanks. 
 547-586 Foreign Country 
 or U.S. 
 Possession 
 40 Enter the name of the foreign country or U.S. possession to 
 which the withheld foreign tax (Amount Code 6) applies.  
 Otherwise, enter blanks. 
 587-599 CUSIP Number 13 Enter CUSIP Number. If the tax-exempt interest is reported in 
 the aggregate for multiple bonds or accounts, enter: VARIOUS.
 Right-justify information and fill unused positions with blanks l. 
 600-662 Blank 63 Enter blanks. 
 663-722 Special Data 
 Entries  
 60 This portion of the “B” Record may be used to record 
 information for state or local government reporting or for the  
 filer's own purposes.  Payers should contact the state or local 
 revenue departments for filing requirements.  You may enter 
 your routing and transit number (RTN) here.  If this field is not 
 utilized, enter blanks. 
 103-114 Payment 
 Amount 5*
@ -127,3 +276,43 @@ Amount 8*
 12 The amount reported in this field represents payments f
 """)
 tokens3 = pdp.load("""
 544-546 Blank 3 Enter blanks. 
 547 Trade or 
 Business 
 Indicator 
 1 Enter “1” (one) to indicate the state or local income tax refund, 
 credit, or offset (Amount Code 2) is attributable to income tax 
 that applies exclusively to income from a trade or business. 
 Indicator  Usage
 1    Income tax refund applies exclusively to a trade or 
 business. 
 Blank   Income tax refund is a general tax refund. 
 548-551 Tax Year of 
 Refund 
 4 Enter the tax year for which the refund, credit, or 
 offset (Amount Code 2) was issued.  The tax year must reflect 
 the tax year for which the refund was made, not the tax year 
 of Form 1099-G.  The tax year must be in the four-position 
 format of YYYY (e.g., 2010).  The valid range of years for the 
 refund is 2001 through 2010.
 """)