Merge branch 'master' of brimstone.klowner.com:pyaccuwage
Conflicts: pyaccuwage/pdfextract.py
This commit is contained in:
commit
456c15eb1c
6 changed files with 85 additions and 74 deletions
|
@ -12,7 +12,7 @@ class ValidationError(Exception):
|
|||
return "(%s.%s) %s" % (self.field.parent_name, self.field.name, self.msg)
|
||||
else:
|
||||
return repr(self.msg)
|
||||
|
||||
|
||||
|
||||
class Field(object):
|
||||
creation_counter = 0
|
||||
|
@ -31,13 +31,13 @@ class Field(object):
|
|||
|
||||
def get_data(self):
|
||||
raise NotImplemented
|
||||
|
||||
|
||||
def __setvalue(self, value):
|
||||
self._value = value
|
||||
|
||||
def __getvalue(self):
|
||||
def __getvalue(self):
|
||||
return self._value
|
||||
|
||||
|
||||
value = property(__getvalue, __setvalue)
|
||||
|
||||
def read(self, fp):
|
||||
|
@ -50,7 +50,7 @@ class Field(object):
|
|||
self.value = s.strip()
|
||||
|
||||
|
||||
class TextField(Field):
|
||||
class TextField(Field):
|
||||
def validate(self):
|
||||
if self.value == None and self.required:
|
||||
raise ValidationError("value required", field=self)
|
||||
|
@ -65,7 +65,7 @@ class TextField(Field):
|
|||
|
||||
|
||||
class StateField(TextField):
|
||||
def __init__(self, name=None, required=True, use_numeric=False):
|
||||
def __init__(self, name=None, required=True, use_numeric=False, max_length=2):
|
||||
super(StateField, self).__init__(name=name, max_length=2, required=required)
|
||||
self.use_numeric = use_numeric
|
||||
|
||||
|
@ -90,7 +90,7 @@ class StateField(TextField):
|
|||
|
||||
class EmailField(TextField):
|
||||
def __init__(self, name=None, required=True, max_length=None):
|
||||
return super(EmailField, self).__init__(name=name, max_length=max_length,
|
||||
return super(EmailField, self).__init__(name=name, max_length=max_length,
|
||||
required=required, uppercase=False)
|
||||
|
||||
class IntegerField(TextField):
|
||||
|
@ -101,7 +101,7 @@ class IntegerField(TextField):
|
|||
int(self.value)
|
||||
except ValueError:
|
||||
raise ValidationError("field contains non-numeric characters", field=self)
|
||||
|
||||
|
||||
|
||||
def get_data(self):
|
||||
value = self.value or ""
|
||||
|
@ -123,7 +123,7 @@ class StaticField(TextField):
|
|||
class BlankField(TextField):
|
||||
def __init__(self, name=None, max_length=0, required=False):
|
||||
super(TextField, self).__init__(name=name, max_length=max_length, required=required, uppercase=False)
|
||||
|
||||
|
||||
def get_data(self):
|
||||
return " " * self.max_length
|
||||
|
||||
|
@ -161,7 +161,7 @@ class MoneyField(Field):
|
|||
|
||||
class DateField(TextField):
|
||||
def __init__(self, name=None, required=True, value=None):
|
||||
super(TextField, self).__init__(name=name, required=required, max_length=8)
|
||||
super(TextField, self).__init__(name=name, required=required, max_length=8)
|
||||
if value:
|
||||
self.value = value
|
||||
|
||||
|
@ -169,7 +169,7 @@ class DateField(TextField):
|
|||
if self._value:
|
||||
return self._value.strftime('%m%d%Y')
|
||||
return '0' * self.max_length
|
||||
|
||||
|
||||
def parse(self, s):
|
||||
if int(s) > 0:
|
||||
self.value = datetime.date(*[int(x) for x in s[4:8], s[0:2], s[2:4]])
|
||||
|
@ -184,9 +184,9 @@ class DateField(TextField):
|
|||
else:
|
||||
self._value = None
|
||||
|
||||
def __getvalue(self):
|
||||
def __getvalue(self):
|
||||
return self._value
|
||||
|
||||
|
||||
value = property(__getvalue, __setvalue)
|
||||
|
||||
|
||||
|
@ -196,7 +196,7 @@ class MonthYearField(TextField):
|
|||
|
||||
if value:
|
||||
self.value = value
|
||||
|
||||
|
||||
def get_data(self):
|
||||
if self._value:
|
||||
return self._value.strftime("%m%Y")
|
||||
|
@ -215,9 +215,9 @@ class MonthYearField(TextField):
|
|||
self._value = datetime.date(*[int(x) for x in value[2:6], value[0:2], 1])
|
||||
else:
|
||||
self._value = None
|
||||
|
||||
def __getvalue(self):
|
||||
|
||||
def __getvalue(self):
|
||||
return self._value
|
||||
|
||||
|
||||
value = property(__getvalue, __setvalue)
|
||||
|
||||
|
|
|
@ -6,6 +6,7 @@ import pdb
|
|||
class Model(object):
|
||||
record_identifier = ' '
|
||||
required = False
|
||||
target_size = 512
|
||||
|
||||
def __init__(self):
|
||||
for (key, value) in self.__class__.__dict__.items():
|
||||
|
@ -32,7 +33,7 @@ class Model(object):
|
|||
for key in self.__class__.__dict__.keys():
|
||||
attr = getattr(self, key)
|
||||
if isinstance(attr, Field):
|
||||
fields.append(attr)
|
||||
fields.append(attr)
|
||||
return fields
|
||||
|
||||
def get_sorted_fields(self):
|
||||
|
@ -52,10 +53,16 @@ class Model(object):
|
|||
custom_validator(f)
|
||||
|
||||
def output(self):
|
||||
result = ''.join([self.record_identifier] + [field.get_data() for field in self.get_sorted_fields()])
|
||||
if len(result) != 512:
|
||||
raise ValidationError("Record result length not equal to 512 bytes (%d)" % len(result))
|
||||
return result
|
||||
result = ''.join([self.record_identifier] +
|
||||
[field.get_data() for field in self.get_sorted_fields()])
|
||||
if len(result) != self.record_length:
|
||||
raise ValidationError("Record result length not equal to %d bytes (%d)" % (self.record_length, len(result)))
|
||||
|
||||
#result = ''.join([self.record_identifier] + [field.get_data() for field in self.get_sorted_fields()])
|
||||
#if len(result) != self.target_size:
|
||||
# raise ValidationError("Record result length not equal to %d bytes (%d)" % (self.target_size, len(result)))
|
||||
|
||||
return result
|
||||
|
||||
def read(self, fp):
|
||||
for field in self.get_sorted_fields():
|
||||
|
|
|
@ -132,7 +132,7 @@ class RecordBuilder(object):
|
|||
(fields.BlankField, {
|
||||
'regexp': {
|
||||
'name': [
|
||||
re.compile(r'^blank$'),
|
||||
(re.compile(r'^blank$', re.IGNORECASE), +1),
|
||||
],
|
||||
},
|
||||
}),
|
||||
|
@ -140,9 +140,13 @@ class RecordBuilder(object):
|
|||
(fields.MoneyField, {
|
||||
'regexp': {
|
||||
'desc': [
|
||||
re.compile(r'right\-justified'),
|
||||
re.compile(r'amount'),
|
||||
re.compile(r'zero\-filled'),
|
||||
(re.compile(r'right\-justif', re.IGNORECASE), +1),
|
||||
(re.compile(r'amount', re.IGNORECASE), +1),
|
||||
(re.compile(r'zero\-filled', re.IGNORECASE), +1),
|
||||
(re.compile(r'leading zeroes', re.IGNORECASE), +1),
|
||||
|
||||
(re.compile(r'left-\justif', re.IGNORECASE), -1),
|
||||
|
||||
],
|
||||
},
|
||||
}),
|
||||
|
@ -150,7 +154,7 @@ class RecordBuilder(object):
|
|||
(fields.TextField, {
|
||||
'regexp': {
|
||||
'desc': [
|
||||
re.compile(r'enter blanks'),
|
||||
(re.compile(r'blanks', re.IGNORECASE), +1),
|
||||
],
|
||||
},
|
||||
}),
|
||||
|
@ -158,12 +162,24 @@ class RecordBuilder(object):
|
|||
(fields.StateField, {
|
||||
'regexp': {
|
||||
'desc': [
|
||||
re.compile(r'state'),
|
||||
re.compile(r'postal'),
|
||||
(re.compile(r'state', re.IGNORECASE), +1),
|
||||
(re.compile(r'postal', re.IGNORECASE), +1),
|
||||
],
|
||||
},
|
||||
'length': ['=2'],
|
||||
})
|
||||
}),
|
||||
|
||||
(fields.IntegerField, {
|
||||
'regexp': {
|
||||
'desc': [
|
||||
(re.compile(r'right\-justif', re.IGNORECASE), +1),
|
||||
(re.compile(r'leading zeroes', re.IGNORECASE), +1),
|
||||
(re.compile(r'number', re.IGNORECASE), +1),
|
||||
|
||||
(re.compile(r'left\-justif', re.IGNORECASE), -1),
|
||||
],
|
||||
},
|
||||
}),
|
||||
]
|
||||
|
||||
def load(self, entries):
|
||||
|
@ -233,8 +249,8 @@ class RecordBuilder(object):
|
|||
|
||||
if 'regexp' in criteria:
|
||||
for crit_key, crit_values in criteria['regexp'].items():
|
||||
for crit_re in crit_values:
|
||||
matches[classtype] += 1 if crit_re.search(entry[crit_key]) else 0
|
||||
for (crit_re, score) in crit_values:
|
||||
matches[classtype] += score if crit_re.search(entry[crit_key]) else 0
|
||||
|
||||
|
||||
matches = list(matches.items())
|
||||
|
|
|
@ -24,6 +24,9 @@ class PDFRecordFinder(object):
|
|||
def records(self):
|
||||
headings = self.locate_heading_rows_by_field()
|
||||
|
||||
#for x in headings:
|
||||
# print x
|
||||
|
||||
for (start, end, name) in headings:
|
||||
name = name.decode('ascii', 'ignore')
|
||||
yield (name, list(self.find_fields(iter(self.textrows[start+1:end]))), (start+1, end))
|
||||
|
@ -108,45 +111,30 @@ class PDFRecordFinder(object):
|
|||
blank_row_counter = 0
|
||||
|
||||
for r in row_iter:
|
||||
row = self.extract_columns_from_row(r.decode('UTF-8'))
|
||||
if not row:
|
||||
cc.empty_row()
|
||||
row = r.decode('UTF-8')
|
||||
#print row
|
||||
row_columns = self.extract_columns_from_row(row)
|
||||
|
||||
if not row_columns:
|
||||
if cc.data and len(cc.data.keys()) > 1 and len(row.strip()) > cc.data.keys()[-1]:
|
||||
yield cc
|
||||
cc = ColumnCollector()
|
||||
else:
|
||||
cc.empty_row()
|
||||
continue
|
||||
|
||||
try:
|
||||
cc.add(row)
|
||||
cc.add(row_columns)
|
||||
|
||||
except IsNextField, e:
|
||||
yield cc
|
||||
cc = ColumnCollector()
|
||||
cc.add(row)
|
||||
cc.add(row_columns)
|
||||
except UnknownColumn, e:
|
||||
raise StopIteration
|
||||
|
||||
yield cc
|
||||
|
||||
def find_fields_old(self, row_iter):
|
||||
cc = ColumnCollector()
|
||||
|
||||
for r in row_iter:
|
||||
row = self.extract_columns_from_row(r.decode('UTF-8'))
|
||||
|
||||
if not row:
|
||||
continue
|
||||
|
||||
if cc.is_next_field(row):
|
||||
#if row[1][1] == 'Vendor Indicator':
|
||||
# import pdb
|
||||
# pdb.set_trace()
|
||||
yield cc
|
||||
cc = ColumnCollector()
|
||||
|
||||
try:
|
||||
cc.add(row)
|
||||
|
||||
except UnknownColumn, e:
|
||||
raise StopIteration
|
||||
yield cc
|
||||
|
||||
|
||||
def extract_columns_from_row(self, row):
|
||||
re_multiwhite = re.compile(r'\s{2,}')
|
||||
|
@ -202,9 +190,9 @@ class ColumnCollector(object):
|
|||
|
||||
def __repr__(self):
|
||||
return "<%s: %s>" % (
|
||||
self.__class__.__name__,
|
||||
map(lambda x:x if len(x) < 25 else x[:25] + '..',
|
||||
self.data.values() if self.data else '' ))
|
||||
self.__class__.__name__,
|
||||
map(lambda x:x if len(x) < 25 else x[:25] + '..',
|
||||
self.data.values() if self.data else ''))
|
||||
|
||||
def add(self, data):
|
||||
#if self.empty_rows > 2:
|
||||
|
@ -249,7 +237,6 @@ class ColumnCollector(object):
|
|||
|
||||
def adjust_columns(self, data):
|
||||
adjusted_data = {}
|
||||
|
||||
for col_id, value in data:
|
||||
if col_id in self.data.keys():
|
||||
adjusted_data[col_id] = value.strip()
|
||||
|
@ -318,9 +305,11 @@ class ColumnCollector(object):
|
|||
|
||||
@property
|
||||
def tuple(self):
|
||||
try:
|
||||
#try:
|
||||
if self.data:
|
||||
return tuple(map(lambda k:self.data[k], sorted(self.data.keys())))
|
||||
except:
|
||||
import pdb
|
||||
pdb.set_trace()
|
||||
return ()
|
||||
#except:
|
||||
# import pdb
|
||||
# pdb.set_trace()
|
||||
|
||||
|
|
|
@ -14,15 +14,12 @@ args = parser.parse_args()
|
|||
|
||||
def generate_imports():
|
||||
return "\n".join([
|
||||
"from pyaccuwage import model",
|
||||
"from pyaccuwage import model as pyaccuwagemodel",
|
||||
"from pyaccuwage.fields import *",
|
||||
"",
|
||||
"",
|
||||
])
|
||||
|
||||
def generate_class_begin(name):
|
||||
return "class %s(mode.Model):\n" % name
|
||||
|
||||
if args.full:
|
||||
sys.stdout.write(generate_imports())
|
||||
|
||||
|
@ -67,7 +64,9 @@ for rec in records:
|
|||
|
||||
#print last_record_ends_at + 1, begins_at
|
||||
if last_record_ends_at + 1 != begins_at:
|
||||
sys.stdout.write("\nclass %s(object):\n" % re.sub('[^\w]','',rec[0].split(':')[-1]))
|
||||
name = re.sub('^[^a-zA-Z]*','',rec[0].split(':')[-1])
|
||||
name = re.sub('[^\w]*', '', name)
|
||||
sys.stdout.write("\nclass %s(pyaccuwagemodel.Model):\n" % name)
|
||||
|
||||
for field in builder.load(map(lambda x:x.tuple, rec[1][0:])):
|
||||
sys.stdout.write('\t' + field + '\n')
|
||||
|
|
2
setup.py
2
setup.py
|
@ -1,6 +1,6 @@
|
|||
from distutils.core import setup
|
||||
setup(name='pyaccuwage',
|
||||
version='0.2012.0',
|
||||
version='0.2012.1',
|
||||
packages=['pyaccuwage'],
|
||||
scripts=['scripts/pyaccuwage-parse', 'scripts/pyaccuwage-pdfparse'],
|
||||
zip_safe=True,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue