diff --git a/pyaccuwage/parser.py b/pyaccuwage/parser.py index bc62dd4..b103ee8 100644 --- a/pyaccuwage/parser.py +++ b/pyaccuwage/parser.py @@ -41,31 +41,53 @@ class BaseToken(object): regexp = re.compile('(.*)') def __init__(self, value): - self.value = value + self._value = value def match(self, value): return self.regexp def __repr__(self): - return ",".join([str(self.__class__), self.value]) + return ",".join([str(self.__class__), self._value]) + + +class StringToken(BaseToken): + regexp = re.compile('(.*)') + + @property + def value(self): + return self._value.decode('ascii','ignore') class RangeToken(BaseToken): regexp = re.compile('(\d+)-(\d+)') - + + @property + def value(self): + return reduce(lambda x,y: y-x, map(int, self._value.split('-')))+1 + + @property + def end_position(self): + return int(self._value.split('-')[1]) + class NumericToken(BaseToken): - regexp = re.compile('(\d+)') + regexp = re.compile('^(\d+)$') + + @property + def value(self): + return int(self._value) + class PastedDefParser(object): TOKEN_TYPES = [ RangeToken, NumericToken, - BaseToken, + StringToken, ] def load(self, infile): tokens = self._tokenize(infile) entries = self._parse(tokens) + #entries = self._compile(entries) return entries def _tokenize(self, data): @@ -82,10 +104,85 @@ class PastedDefParser(object): # TODO group things based on strides between RangeTokens, probably # starting with range token, then the following BaseTokens are likely # the field name, followed by a NumericToken, then Base/Numeric tokens - # for the field's description, until then ext RangeToken is found. - results = tokens + # for the field's description, until then next RangeToken is found. + groups = [] + current_range = None + current_name = [] + current_length = None + current_desc = [] + state = 'range' + + byte_pos = None + + # COLLECT TOKENS INTO GROUPS + for token in tokens: + """ + if byte_pos == None: + # START. IF byte_pos IS NONE, WE ASSUME WE'RE STARTING WITH + # A NEW SERIES OF FIELDS. THE FIRST VALUE WHICH IS A RANGE + # OR A NUMERIC FIELD IS ASSUMED TO BE THE FIELD POSITION/RANGE. + + if isinstance(token, RangeToken): + byte_pos = token.end_position + 1 + elif isinstance(token, NumericToken): + byte_pos = token.value + 1 + """ + if isinstance(token, NumericToken): + # THIS MAY BE A RANGE, IF THE VALUE MATCHES THE CURRENT byte_pos + # THEN WE ASSUME THIS TO BE A SINGLE BYTE RANGE DEFINITION, + # AND WE WILL TREAT IT AS A NEW RECORD. + if byte_pos == None or token.value == byte_pos: + # UPDATE RANGE POSITION + byte_pos = token.value + 1 + + # CONVERT TOKEN INTO RangeToken + token = RangeToken("%d-%d" % (token.value, token.value)) + + + if isinstance(token, RangeToken): + if current_range: + groups.append({ + 'byterange': current_range, + 'name': current_name, + 'length': current_length, + 'desc': current_desc, + }) + + # UPDATE RANGE POSITION + byte_pos = token.end_position + 1 + + current_range = token + current_name = [] + current_length = None + current_desc = [] + state = 'name' + + + elif state == 'name': + if isinstance(token, StringToken) and current_name and isinstance(current_name[-1], NumericToken): + current_length = current_name.pop() + state = 'desc' + else: + current_name.append(token) + if state == 'desc': + current_desc.append(token) + + results = groups return results + def _compile(self, groups): + for g in groups: + assert g['byterange'].value == g['length'].value + + name = u'_'.join(map(lambda x:x.value.lower(), g['name'])) + name = re.sub(r'[^\w]','', name) + + yield({ + 'name': name, + 'desc': u' '.join(map(lambda x:unicode(x.value), g['desc'])), + 'length': g['byterange'].value, + }) + sdp = SimpleDefParser() tokens = sdp.load([ @@ -97,6 +194,58 @@ tokens = sdp.load([ pdp = PastedDefParser() tokens2 = pdp.load(""" +544 Second TIN + +Notice + +(Optional) + +1 Enter “2” (two) to indicate notification by IRS twice within + +three calendar years that the payee provided an incorrect name + +and/or TIN combination; otherwise, enter a blank. + +545-546 Blank 2 Enter blanks. + +547-586 Foreign Country + +or U.S. + +Possession + +40 Enter the name of the foreign country or U.S. possession to + +which the withheld foreign tax (Amount Code 6) applies. + +Otherwise, enter blanks. + +587-599 CUSIP Number 13 Enter CUSIP Number. If the tax-exempt interest is reported in + +the aggregate for multiple bonds or accounts, enter: VARIOUS. + +Right-justify information and fill unused positions with blanks l. + +600-662 Blank 63 Enter blanks. + +663-722 Special Data + +Entries + +60 This portion of the “B” Record may be used to record + +information for state or local government reporting or for the + +filer's own purposes. Payers should contact the state or local + +revenue departments for filing requirements. You may enter + +your routing and transit number (RTN) here. If this field is not + +utilized, enter blanks. + + + 103-114 Payment Amount 5* @@ -127,3 +276,43 @@ Amount 8* 12 The amount reported in this field represents payments f """) + +tokens3 = pdp.load(""" +544-546 Blank 3 Enter blanks. + +547 Trade or + +Business + +Indicator + +1 Enter “1” (one) to indicate the state or local income tax refund, + +credit, or offset (Amount Code 2) is attributable to income tax + +that applies exclusively to income from a trade or business. + +Indicator Usage + +1 Income tax refund applies exclusively to a trade or + +business. + +Blank Income tax refund is a general tax refund. + +548-551 Tax Year of + +Refund + +4 Enter the tax year for which the refund, credit, or + +offset (Amount Code 2) was issued. The tax year must reflect + +the tax year for which the refund was made, not the tax year + +of Form 1099-G. The tax year must be in the four-position + +format of YYYY (e.g., 2010). The valid range of years for the + +refund is 2001 through 2010. +""")