working better!

This commit is contained in:
Binh 2013-02-05 15:43:04 -06:00
parent e6e087ef38
commit 730073dcd1

View file

@ -108,45 +108,29 @@ class PDFRecordFinder(object):
blank_row_counter = 0
for r in row_iter:
row = self.extract_columns_from_row(r.decode('UTF-8'))
if not row:
cc.empty_row()
row = r.decode('UTF-8')
row_columns = self.extract_columns_from_row(row)
if not row_columns:
if cc.data and len(cc.data.keys()) > 1 and len(row.strip()) > cc.data.keys()[-1]:
yield cc
cc = ColumnCollector()
else:
cc.empty_row()
continue
try:
cc.add(row)
cc.add(row_columns)
except IsNextField, e:
yield cc
cc = ColumnCollector()
cc.add(row)
cc.add(row_columns)
except UnknownColumn, e:
raise StopIteration
yield cc
def find_fields_old(self, row_iter):
    """Group raw rows into ``ColumnCollector`` objects, yielding one per field.

    Each item of ``row_iter`` is decoded as UTF-8 and split into columns
    with ``self.extract_columns_from_row``. Rows that produce no columns
    are skipped. When the current collector reports (via
    ``is_next_field``) that a row starts the next field, the collector is
    yielded and a fresh one is started before the row is added.

    If ``ColumnCollector.add`` raises ``UnknownColumn`` the generator ends
    immediately and the collector in progress is NOT yielded. Otherwise
    the last collector is yielded once ``row_iter`` is exhausted (this
    happens even when no rows were added to it).
    """
    cc = ColumnCollector()
    for r in row_iter:
        row = self.extract_columns_from_row(r.decode('UTF-8'))
        if not row:
            continue
        if cc.is_next_field(row):
            yield cc
            cc = ColumnCollector()
        try:
            cc.add(row)
        except UnknownColumn:
            # End the generator with a plain ``return``: raising
            # StopIteration inside a generator body is turned into a
            # RuntimeError on Python 3.7+ (PEP 479), while ``return``
            # behaves the same on every version.
            return
    yield cc
def extract_columns_from_row(self, row):
re_multiwhite = re.compile(r'\s{2,}')
@ -246,7 +230,6 @@ class ColumnCollector(object):
def adjust_columns(self, data):
adjusted_data = {}
for col_id, value in data:
if col_id in self.data.keys():
adjusted_data[col_id] = value.strip()
@ -315,9 +298,11 @@ class ColumnCollector(object):
@property
def tuple(self):
    """Return the values of ``self.data`` as a tuple, ordered by sorted key.

    Returns an empty tuple when ``self.data`` is empty or otherwise falsy.
    """
    if not self.data:
        return ()
    # Inside a method body ``tuple`` resolves to the builtin, not to this
    # property (class-level names are not searched from method scope).
    return tuple(self.data[key] for key in sorted(self.data.keys()))