diff --git a/pyaccuwage/fields.py b/pyaccuwage/fields.py index 9e38448..960db3e 100644 --- a/pyaccuwage/fields.py +++ b/pyaccuwage/fields.py @@ -12,7 +12,7 @@ class ValidationError(Exception): return "(%s.%s) %s" % (self.field.parent_name, self.field.name, self.msg) else: return repr(self.msg) - + class Field(object): creation_counter = 0 @@ -31,13 +31,13 @@ class Field(object): def get_data(self): raise NotImplemented - + def __setvalue(self, value): self._value = value - def __getvalue(self): + def __getvalue(self): return self._value - + value = property(__getvalue, __setvalue) def read(self, fp): @@ -50,7 +50,7 @@ class Field(object): self.value = s.strip() -class TextField(Field): +class TextField(Field): def validate(self): if self.value == None and self.required: raise ValidationError("value required", field=self) @@ -65,7 +65,7 @@ class TextField(Field): class StateField(TextField): - def __init__(self, name=None, required=True, use_numeric=False): + def __init__(self, name=None, required=True, use_numeric=False, max_length=2): super(StateField, self).__init__(name=name, max_length=2, required=required) self.use_numeric = use_numeric @@ -90,7 +90,7 @@ class StateField(TextField): class EmailField(TextField): def __init__(self, name=None, required=True, max_length=None): - return super(EmailField, self).__init__(name=name, max_length=max_length, + return super(EmailField, self).__init__(name=name, max_length=max_length, required=required, uppercase=False) class IntegerField(TextField): @@ -101,7 +101,7 @@ class IntegerField(TextField): int(self.value) except ValueError: raise ValidationError("field contains non-numeric characters", field=self) - + def get_data(self): value = self.value or "" @@ -123,7 +123,7 @@ class StaticField(TextField): class BlankField(TextField): def __init__(self, name=None, max_length=0, required=False): super(TextField, self).__init__(name=name, max_length=max_length, required=required, uppercase=False) - + def get_data(self): return " " * self.max_length @@ -161,7 +161,7 @@ class MoneyField(Field): class DateField(TextField): def __init__(self, name=None, required=True, value=None): - super(TextField, self).__init__(name=name, required=required, max_length=8) + super(TextField, self).__init__(name=name, required=required, max_length=8) if value: self.value = value @@ -169,7 +169,7 @@ class DateField(TextField): if self._value: return self._value.strftime('%m%d%Y') return '0' * self.max_length - + def parse(self, s): if int(s) > 0: self.value = datetime.date(*[int(x) for x in s[4:8], s[0:2], s[2:4]]) @@ -184,9 +184,9 @@ class DateField(TextField): else: self._value = None - def __getvalue(self): + def __getvalue(self): return self._value - + value = property(__getvalue, __setvalue) @@ -196,7 +196,7 @@ class MonthYearField(TextField): if value: self.value = value - + def get_data(self): if self._value: return self._value.strftime("%m%Y") @@ -215,9 +215,9 @@ class MonthYearField(TextField): self._value = datetime.date(*[int(x) for x in value[2:6], value[0:2], 1]) else: self._value = None - - def __getvalue(self): + + def __getvalue(self): return self._value - + value = property(__getvalue, __setvalue) diff --git a/pyaccuwage/model.py b/pyaccuwage/model.py index 9e206db..a8f83cb 100644 --- a/pyaccuwage/model.py +++ b/pyaccuwage/model.py @@ -6,6 +6,7 @@ import pdb class Model(object): record_identifier = ' ' required = False + target_size = 512 def __init__(self): for (key, value) in self.__class__.__dict__.items(): @@ -32,7 +33,7 @@ class Model(object): for key in self.__class__.__dict__.keys(): attr = getattr(self, key) if isinstance(attr, Field): - fields.append(attr) + fields.append(attr) return fields def get_sorted_fields(self): @@ -52,10 +53,16 @@ class Model(object): custom_validator(f) def output(self): - result = ''.join([self.record_identifier] + [field.get_data() for field in self.get_sorted_fields()]) - if len(result) != 512: - raise ValidationError("Record result length not equal to 512 bytes (%d)" % len(result)) - return result + result = ''.join([self.record_identifier] + + [field.get_data() for field in self.get_sorted_fields()]) + if len(result) != self.record_length: + raise ValidationError("Record result length not equal to %d bytes (%d)" % (self.record_length, len(result))) + + #result = ''.join([self.record_identifier] + [field.get_data() for field in self.get_sorted_fields()]) + #if len(result) != self.target_size: + # raise ValidationError("Record result length not equal to %d bytes (%d)" % (self.target_size, len(result))) + + return result def read(self, fp): for field in self.get_sorted_fields(): diff --git a/pyaccuwage/parser.py b/pyaccuwage/parser.py index 1b475ca..b367db1 100644 --- a/pyaccuwage/parser.py +++ b/pyaccuwage/parser.py @@ -132,7 +132,7 @@ class RecordBuilder(object): (fields.BlankField, { 'regexp': { 'name': [ - re.compile(r'^blank$'), + (re.compile(r'^blank$', re.IGNORECASE), +1), ], }, }), @@ -140,9 +140,13 @@ class RecordBuilder(object): (fields.MoneyField, { 'regexp': { 'desc': [ - re.compile(r'right\-justified'), - re.compile(r'amount'), - re.compile(r'zero\-filled'), + (re.compile(r'right\-justif', re.IGNORECASE), +1), + (re.compile(r'amount', re.IGNORECASE), +1), + (re.compile(r'zero\-filled', re.IGNORECASE), +1), + (re.compile(r'leading zeroes', re.IGNORECASE), +1), + + (re.compile(r'left-\justif', re.IGNORECASE), -1), + ], }, }), @@ -150,7 +154,7 @@ class RecordBuilder(object): (fields.TextField, { 'regexp': { 'desc': [ - re.compile(r'enter blanks'), + (re.compile(r'blanks', re.IGNORECASE), +1), ], }, }), @@ -158,12 +162,24 @@ class RecordBuilder(object): (fields.StateField, { 'regexp': { 'desc': [ - re.compile(r'state'), - re.compile(r'postal'), + (re.compile(r'state', re.IGNORECASE), +1), + (re.compile(r'postal', re.IGNORECASE), +1), ], }, 'length': ['=2'], - }) + }), + + (fields.IntegerField, { + 'regexp': { + 'desc': [ + (re.compile(r'right\-justif', re.IGNORECASE), +1), + (re.compile(r'leading zeroes', re.IGNORECASE), +1), + (re.compile(r'number', re.IGNORECASE), +1), + + (re.compile(r'left\-justif', re.IGNORECASE), -1), + ], + }, + }), ] def load(self, entries): @@ -233,8 +249,8 @@ class RecordBuilder(object): if 'regexp' in criteria: for crit_key, crit_values in criteria['regexp'].items(): - for crit_re in crit_values: - matches[classtype] += 1 if crit_re.search(entry[crit_key]) else 0 + for (crit_re, score) in crit_values: + matches[classtype] += score if crit_re.search(entry[crit_key]) else 0 matches = list(matches.items()) diff --git a/pyaccuwage/pdfextract.py b/pyaccuwage/pdfextract.py index cb60870..7660912 100644 --- a/pyaccuwage/pdfextract.py +++ b/pyaccuwage/pdfextract.py @@ -24,6 +24,9 @@ class PDFRecordFinder(object): def records(self): headings = self.locate_heading_rows_by_field() + #for x in headings: + # print x + for (start, end, name) in headings: name = name.decode('ascii', 'ignore') yield (name, list(self.find_fields(iter(self.textrows[start+1:end]))), (start+1, end)) @@ -108,45 +111,30 @@ class PDFRecordFinder(object): blank_row_counter = 0 for r in row_iter: - row = self.extract_columns_from_row(r.decode('UTF-8')) - if not row: - cc.empty_row() + row = r.decode('UTF-8') + #print row + row_columns = self.extract_columns_from_row(row) + + if not row_columns: + if cc.data and len(cc.data.keys()) > 1 and len(row.strip()) > cc.data.keys()[-1]: + yield cc + cc = ColumnCollector() + else: + cc.empty_row() continue try: - cc.add(row) + cc.add(row_columns) + except IsNextField, e: yield cc cc = ColumnCollector() - cc.add(row) + cc.add(row_columns) except UnknownColumn, e: raise StopIteration yield cc - def find_fields_old(self, row_iter): - cc = ColumnCollector() - - for r in row_iter: - row = self.extract_columns_from_row(r.decode('UTF-8')) - - if not row: - continue - - if cc.is_next_field(row): - #if row[1][1] == 'Vendor Indicator': - # import pdb - # pdb.set_trace() - yield cc - cc = ColumnCollector() - - try: - cc.add(row) - - except UnknownColumn, e: - raise StopIteration - yield cc - def extract_columns_from_row(self, row): re_multiwhite = re.compile(r'\s{2,}') @@ -202,9 +190,9 @@ class ColumnCollector(object): def __repr__(self): return "<%s: %s>" % ( - self.__class__.__name__, - map(lambda x:x if len(x) < 25 else x[:25] + '..', - self.data.values() if self.data else '' )) + self.__class__.__name__, + map(lambda x:x if len(x) < 25 else x[:25] + '..', + self.data.values() if self.data else '')) def add(self, data): #if self.empty_rows > 2: @@ -249,7 +237,6 @@ class ColumnCollector(object): def adjust_columns(self, data): adjusted_data = {} - for col_id, value in data: if col_id in self.data.keys(): adjusted_data[col_id] = value.strip() @@ -318,9 +305,11 @@ class ColumnCollector(object): @property def tuple(self): - try: + #try: + if self.data: return tuple(map(lambda k:self.data[k], sorted(self.data.keys()))) - except: - import pdb - pdb.set_trace() + return () + #except: + # import pdb + # pdb.set_trace() diff --git a/scripts/pyaccuwage-pdfparse b/scripts/pyaccuwage-pdfparse index 1b147c3..6a35387 100755 --- a/scripts/pyaccuwage-pdfparse +++ b/scripts/pyaccuwage-pdfparse @@ -14,15 +14,12 @@ args = parser.parse_args() def generate_imports(): return "\n".join([ - "from pyaccuwage import model", + "from pyaccuwage import model as pyaccuwagemodel", "from pyaccuwage.fields import *", "", "", ]) -def generate_class_begin(name): - return "class %s(mode.Model):\n" % name - if args.full: sys.stdout.write(generate_imports()) @@ -67,7 +64,9 @@ for rec in records: #print last_record_ends_at + 1, begins_at if last_record_ends_at + 1 != begins_at: - sys.stdout.write("\nclass %s(object):\n" % re.sub('[^\w]','',rec[0].split(':')[-1])) + name = re.sub('^[^a-zA-Z]*','',rec[0].split(':')[-1]) + name = re.sub('[^\w]*', '', name) + sys.stdout.write("\nclass %s(pyaccuwagemodel.Model):\n" % name) for field in builder.load(map(lambda x:x.tuple, rec[1][0:])): sys.stdout.write('\t' + field + '\n') diff --git a/setup.py b/setup.py index 3e5bdf9..66da92f 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,6 @@ from distutils.core import setup setup(name='pyaccuwage', - version='0.2012.0', + version='0.2012.1', packages=['pyaccuwage'], scripts=['scripts/pyaccuwage-parse', 'scripts/pyaccuwage-pdfparse'], zip_safe=True,