working better!

This commit is contained in:
Binh 2013-02-05 15:43:04 -06:00
parent e6e087ef38
commit 730073dcd1

View file

@ -108,45 +108,29 @@ class PDFRecordFinder(object):
blank_row_counter = 0 blank_row_counter = 0
for r in row_iter: for r in row_iter:
row = self.extract_columns_from_row(r.decode('UTF-8')) row = r.decode('UTF-8')
if not row: row_columns = self.extract_columns_from_row(row)
cc.empty_row()
if not row_columns:
if cc.data and len(cc.data.keys()) > 1 and len(row.strip()) > cc.data.keys()[-1]:
yield cc
cc = ColumnCollector()
else:
cc.empty_row()
continue continue
try: try:
cc.add(row) cc.add(row_columns)
except IsNextField, e: except IsNextField, e:
yield cc yield cc
cc = ColumnCollector() cc = ColumnCollector()
cc.add(row) cc.add(row_columns)
except UnknownColumn, e: except UnknownColumn, e:
raise StopIteration raise StopIteration
yield cc yield cc
def find_fields_old(self, row_iter):
    """Yield one ColumnCollector per field found in *row_iter* (legacy variant).

    Each raw row is decoded as UTF-8 and split into columns with
    ``extract_columns_from_row``; rows that produce no columns are skipped.
    When the current collector reports, via ``is_next_field``, that a row
    starts the next field, that collector is yielded and a fresh one is
    started before the row is added.

    Iteration ends quietly at the first row whose ``add`` raises
    ``UnknownColumn``; the collector being filled at that point is not
    yielded. Otherwise the last collector is yielded once the rows run out.
    """
    cc = ColumnCollector()
    for r in row_iter:
        row = self.extract_columns_from_row(r.decode('UTF-8'))
        if not row:
            continue
        if cc.is_next_field(row):
            yield cc
            cc = ColumnCollector()
        try:
            cc.add(row)
        except UnknownColumn:
            # A bare ``return`` ends the generator exactly as the old
            # ``raise StopIteration`` did, but stays valid under PEP 479,
            # where raising StopIteration in a generator body turns into
            # RuntimeError.
            return
    yield cc
def extract_columns_from_row(self, row): def extract_columns_from_row(self, row):
re_multiwhite = re.compile(r'\s{2,}') re_multiwhite = re.compile(r'\s{2,}')
@ -246,7 +230,6 @@ class ColumnCollector(object):
def adjust_columns(self, data): def adjust_columns(self, data):
adjusted_data = {} adjusted_data = {}
for col_id, value in data: for col_id, value in data:
if col_id in self.data.keys(): if col_id in self.data.keys():
adjusted_data[col_id] = value.strip() adjusted_data[col_id] = value.strip()
@ -315,9 +298,11 @@ class ColumnCollector(object):
@property
def tuple(self):
    """Return the collected values as a tuple ordered by sorted key.

    An empty tuple is returned when ``self.data`` is empty or otherwise
    falsy.
    """
    if not self.data:
        return ()
    # Inside the method body ``tuple`` resolves to the builtin, not to this
    # property: class-level names are not visible from method scope.
    return tuple(self.data[key] for key in sorted(self.data.keys()))