diff --git a/pyaccuwage/parser.py b/pyaccuwage/parser.py index 5db488f..a8e0df4 100644 --- a/pyaccuwage/parser.py +++ b/pyaccuwage/parser.py @@ -96,6 +96,8 @@ class RangeToken(BaseToken): @property def value(self): + if '-' not in self._value: + return int(self._value) return reduce(lambda x,y: y-x, map(int, self._value.split('-')))+1 @property @@ -115,8 +117,7 @@ class NumericToken(BaseToken): return int(self._value) - -class PastedDefParser(object): +class RecordBuilder(object): import fields TOKEN_TYPES = [ @@ -163,6 +164,102 @@ class PastedDefParser(object): }) ] + def load(self, entries): + # EXPECTS ENTRIES TO BE IN THE FORM OF + # [('1-5', 'Field Name', '5', 'Description of field.'),...] + entries = self._compile(entries) + entries = self._guess_field_types(entries) + entries = self._convert_to_records(entries) + return entries + + + def _compile(self, entries): + for (f_range, f_name, f_length, f_desc) in entries: + f_length = int(f_length) + try: + assert f_length == RangeToken(f_range).value + except AssertionError: + import pdb + pdb.set_trace() + + name_parts = f_name.split(' ') + + if name_parts[-1].lower() == '(optional)': + name_parts = name_parts[0:-1] + required = False + elif re.search('required', f_desc, re.IGNORECASE): + required = True + else: + required = None + + f_name = u'_'.join(map(lambda x:x.lower(), name_parts)) + f_name = re.sub(r'[^\w]','', f_name) + + yield { + 'name': f_name, + 'desc': '(' + f_range + '). ' + f_desc, + 'length': f_length, + 'required': required, + } + + + def _guess_field_types(self, entries): + lengthexp = LengthExpression() + + for entry in entries: + matches = dict(map(lambda x:(x[0],0), self.FIELD_TYPES)) + + for (classtype, criteria) in self.FIELD_TYPES: + if 'length' in criteria: + if not lengthexp(int(entry['length']), criteria['length']): + continue + + if 'regexp' in criteria: + for crit_key, crit_values in criteria['regexp'].items(): + for crit_re in crit_values: + matches[classtype] += 1 if crit_re.search(entry[crit_key]) else 0 + + + matches = list(matches.items()) + matches.sort(key=lambda x:x[1]) + + matches_found = True if sum(map(lambda x:x[1], matches)) > 0 else False + + entry['guessed_type'] = matches[-1][0] if matches_found else self.fields.TextField + yield entry + + def _convert_to_records(self, entries): + blank_count = 1 + for entry in entries: + result = [] + add = result.append + + # FIELD NAME + if entry['name'] == 'blank': + add( (u'blank%d' % blank_count).ljust(40) ) + blank_count += 1 + else: + add(entry['name'].ljust(40)) + + add(' = ') + + if entry['guessed_type']: + add(entry['guessed_type'].__name__) + + args = [] + args.append("max_length=%d" % entry['length']) + if entry['required'] != None: + args.append("required=%s" % ('True' if entry['required'] else 'False')) + + add("(" + ", ".join(args) + ")") + + + yield "".join(result) + + + +class PastedDefParser(RecordBuilder): + def load(self, infile): tokens = self._tokenize(infile) entries = self._parse(tokens) @@ -276,8 +373,9 @@ class PastedDefParser(object): 'length': g['byterange'].value, 'required': required, }) + - + """ def _guess_field_types(self, entries): lengthexp = LengthExpression() @@ -302,8 +400,9 @@ class PastedDefParser(object): entry['guessed_type'] = matches[-1][0] if matches_found else self.fields.TextField yield entry - - + + """ + """ def _convert_to_records(self, entries): blank_count = 1 for entry in entries: @@ -332,4 +431,4 @@ class PastedDefParser(object): yield "".join(result) - + """ diff --git a/pyaccuwage/pdfextract.py b/pyaccuwage/pdfextract.py index 70e76b5..eeba7c5 100644 --- a/pyaccuwage/pdfextract.py +++ b/pyaccuwage/pdfextract.py @@ -18,35 +18,12 @@ class PDFRecordFinder(object): self.textrows = pdftext.split('\n') self.heading_exp = heading_exp - """ - def columns(self): - results = [] - cc = ColumnCollector() - for heading, group in self.record_grouping(): - print "HEADING", heading - for row in group: - if cc.is_next_field(row): - yield cc - cc = ColumnCollector() - #print row - try: - cc.add(row) - except UnknownColumn, e: - results.append(cc) - cc = ColumnCollector() - - - def record_grouping(self): - row_iter = iter(self.textrows) - i = 0 - for row in row_iter: - i += 1 - match = self.heading_exp.match(row) - if match: - yield (match.groups(), self.extract_record_columns(row_iter)) - - """ + def records(self): + headings = self.locate_heading_rows() + for (start, end, name) in headings: + name = name.decode('ascii', 'ignore') + yield (name, list(self.find_fields(iter(self.textrows[start+1:end])))) def locate_heading_rows(self): @@ -56,45 +33,32 @@ class PDFRecordFinder(object): if match: if not ''.join(match.groups()).lower().endswith('(continued)'): results.append((i, ''.join(match.groups()))) - return results + + merged = [] + for (a, b) in zip(results, results[1:] + [(len(self.textrows),None)]): + merged.append( (a[0], b[0]-1, a[1]) ) - - def records2(self): - row_iter = iter(self.textrows) - record = True - while record: - record = self.extract_record(row_iter) - yield record - - - def extract_record(self, row_iter): - heading = self.find_heading(row_iter) - fields = self.find_fields(row_iter) - return heading, list(fields) - - - def find_heading(self, row_iter): - for row in row_iter: - heading_match = self.heading_exp.match(row) - if heading_match: - return heading_match.groups() + return merged def find_fields(self, row_iter): cc = ColumnCollector() for r in row_iter: - row = self.extract_columns_from_row(r) + row = self.extract_columns_from_row(r.decode('UTF-8')) + if not row: continue if cc.is_next_field(row): yield cc cc = ColumnCollector() + try: cc.add(row) + except UnknownColumn, e: - print 'UNKNOWN COLUMN', row raise StopIteration + yield cc def extract_columns_from_row(self, row): @@ -124,7 +88,7 @@ class PDFRecordFinder(object): end = white_iter.next() if start != end: row_result.append( - (start, row[start:end]) + (start, row[start:end].encode('ascii','ignore')) ) except StopIteration: @@ -132,79 +96,7 @@ class PDFRecordFinder(object): return row_result - - - def extract_row_columns(self, row_iter): - re_multiwhite = re.compile(r'\s{2,}') - full_width_text_count = 0 - - #for r in row_iter: - row = None - while not row: - row = row_iter.next() - row = row.decode('UTF-8') - - # IF LINE CONTAINS MULTIPLE WHITESPACE, IT IS PROBABLY A TABLE - if not re_multiwhite.search(row): - full_width_text_count += 1 - row = None - - if True: - white_ranges = [0,] - pos = 0 - match = True - while pos < len(row): - match = re_multiwhite.search(row[pos:]) - if match: - white_ranges.append(pos + match.start()) - white_ranges.append(pos + match.end()) - pos += match.end() - else: - white_ranges.append(len(row)) - pos = len(row) - - - row_result = [] - white_iter = iter(white_ranges) - while white_iter: - try: - start = white_iter.next() - end = white_iter.next() - if start != end: - row_result.append( - (start, row[start:end]) - ) - - except StopIteration: - white_iter = None - - #print white_ranges - return row_result - #yield row_result - #result.append(row_result) - - - """ - row_result = [] - pos = 0 - while pos < len(row): - match = re_multiwhite.search(row[pos:]) - if match and match.start() > 0: - row_result.append(( - pos, - row[pos:pos+match.start()],)) - pos += match.end() - else: - if match: - pos += match.end() - row_result.append((pos,row[pos:])) - pos += len(row) - result.append(row_result) - row_result = [] - """ - #return result - - + class UnknownColumn(Exception): pass @@ -246,6 +138,8 @@ class ColumnCollector(object): return self.data.keys()[0] == first_key return False - + @property + def tuple(self): + return tuple(map(lambda k:self.data[k], sorted(self.data.keys()))) diff --git a/scripts/pyaccuwage-pdfparse b/scripts/pyaccuwage-pdfparse new file mode 100755 index 0000000..11a87ae --- /dev/null +++ b/scripts/pyaccuwage-pdfparse @@ -0,0 +1,47 @@ +#!/usr/bin/python +from pyaccuwage.parser import RecordBuilder +from pyaccuwage.pdfexport import PDFRecordFinder +import argparse +import sys + +parser = argparse.ArgumentParser(description="Parse and convert contents of IRS files into pyaccuwage e-file classes.") +parser.add_argument("-f", "--full", help="Generate full python file, including related imports.", action="store_true") +parser.add_argument("-i", "--input", metavar="file", type=argparse.FileType('r'), help="Source PDF file, ie: p1220.pdf") + + +args = parser.parse_args() + +""" +lines = [] +for x in sys.stdin.readlines(): + lines.append(x) + +pdp = PastedDefParser() +tokens = pdp.load("".join(lines)) +""" +def generate_imports(): + return "\n".join([ + "from pyaccuwage import model", + "from pyaccuwage.fields import *", + "", + "", + ]) + +def generate_class_begin(name): + return "class %s(mode.Model):\n" % name + +if args.full: + sys.stdout.write(generate_imports()) + +if args.classname: + classname = args.classname +else: + classname = "GeneratedRecord" + +sys.stdout.write(generate_class_begin(classname)) + + +for x in tokens: + sys.stdout.write('\t' + x + '\n') + + diff --git a/setup.py b/setup.py index 252e70b..c72be01 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,7 @@ from distutils.core import setup setup(name='pyaccuwage', - version='0.0.6', + version='0.0.7', packages=['pyaccuwage'], - scripts=['scripts/pyaccuwage-parse'], + scripts=['scripts/pyaccuwage-parse', 'scripts/pyaccuwage-pdfparse'], zip_safe=True, )