From b77b80e485031c92c7917da5190cef794cf68a95 Mon Sep 17 00:00:00 2001 From: Binh Nguyen Date: Sat, 30 Jun 2012 15:21:05 -0500 Subject: [PATCH] We need to remove some of the yield statements because they make iteration very confusing to keep track of, due to global iterators being passed around and iterated over in chunks. I've added a locate_heading_rows method which scans the entire document for row numbers that look like record definition headings. I think we can use these number spans to feed into the row columnizer stuff. --- pyaccuwage/pdfextract.py | 161 ++++++++++++++++++++++++++++----------- 1 file changed, 116 insertions(+), 45 deletions(-) diff --git a/pyaccuwage/pdfextract.py b/pyaccuwage/pdfextract.py index eb2a9f0..70e76b5 100644 --- a/pyaccuwage/pdfextract.py +++ b/pyaccuwage/pdfextract.py @@ -17,36 +17,23 @@ class PDFRecordFinder(object): pdftext = subprocess.check_output(opts) self.textrows = pdftext.split('\n') self.heading_exp = heading_exp - - """ - @property - def records(self): - row_iter = iter(self.textrows) - try: - while 1: - row = row_iter.next() - if self.heading_exp.match(row): - record = self.extract_record(row_iter) - yield record - except Exception, e: - raise e - #for r in self.textrows: - # if self.heading_exp.match(r): - # record = self.extract_record() - # yield record - """ - + """ def columns(self): results = [] cc = ColumnCollector() - for group in self.record_grouping(): + for heading, group in self.record_grouping(): + print "HEADING", heading for row in group: - if cc.is_next_record(row): + if cc.is_next_field(row): yield cc cc = ColumnCollector() - print row - cc.add(row) + #print row + try: + cc.add(row) + except UnknownColumn, e: + results.append(cc) + cc = ColumnCollector() @@ -55,31 +42,114 @@ class PDFRecordFinder(object): i = 0 for row in row_iter: i += 1 - if self.heading_exp.match(row): - yield self.extract_record_columns(row_iter) + match = self.heading_exp.match(row) + if match: + yield (match.groups(), 
self.extract_record_columns(row_iter)) + + """ - def extract_record_columns(self, row_iter): - re_multiwhite = re.compile(r'\s{2,}') - result = [] - full_width_text_count = 0 + def locate_heading_rows(self): + results = [] + for (i, row) in enumerate(self.textrows): + match = self.heading_exp.match(row) + if match: + if not ''.join(match.groups()).lower().endswith('(continued)'): + results.append((i, ''.join(match.groups()))) + return results + + def records2(self): + row_iter = iter(self.textrows) + record = True + while record: + record = self.extract_record(row_iter) + yield record + + + def extract_record(self, row_iter): + heading = self.find_heading(row_iter) + fields = self.find_fields(row_iter) + return heading, list(fields) + + + def find_heading(self, row_iter): + for row in row_iter: + heading_match = self.heading_exp.match(row) + if heading_match: + return heading_match.groups() + + + def find_fields(self, row_iter): + cc = ColumnCollector() for r in row_iter: - row = r.decode('UTF-8') + row = self.extract_columns_from_row(r) if not row: continue + + if cc.is_next_field(row): + yield cc + cc = ColumnCollector() + try: + cc.add(row) + except UnknownColumn, e: + print 'UNKNOWN COLUMN', row + raise StopIteration - #if row.strip().startswith('Code'): - # pdb.set_trace() + def extract_columns_from_row(self, row): + re_multiwhite = re.compile(r'\s{2,}') + + # IF LINE DOESN'T CONTAIN MULTIPLE WHITESPACES, IT'S LIKELY NOT A TABLE + if not re_multiwhite.search(row): + return None + + white_ranges = [0,] + pos = 0 + while pos < len(row): + match = re_multiwhite.search(row[pos:]) + if match: + white_ranges.append(pos + match.start()) + white_ranges.append(pos + match.end()) + pos += match.end() + else: + white_ranges.append(len(row)) + pos = len(row) + + row_result = [] + white_iter = iter(white_ranges) + while white_iter: + try: + start = white_iter.next() + end = white_iter.next() + if start != end: + row_result.append( + (start, row[start:end]) + ) + + except 
StopIteration: + white_iter = None + + return row_result + + + + def extract_row_columns(self, row_iter): + re_multiwhite = re.compile(r'\s{2,}') + full_width_text_count = 0 + + #for r in row_iter: + row = None + while not row: + row = row_iter.next() + row = row.decode('UTF-8') + # IF LINE CONTAINS MULTIPLE WHITESPACE, IT IS PROBABLY A TABLE if not re_multiwhite.search(row): full_width_text_count += 1 - #if full_width_text_count > 2: - # print 'full width text count exceeded limit' - # return result - continue - + row = None + + if True: white_ranges = [0,] pos = 0 match = True @@ -107,9 +177,11 @@ class PDFRecordFinder(object): except StopIteration: white_iter = None - print white_ranges - yield row_result - result.append(row_result) + + #print white_ranges + return row_result + #yield row_result + #result.append(row_result) """ @@ -136,7 +208,7 @@ class PDFRecordFinder(object): class UnknownColumn(Exception): pass -class IsNextRecord(Exception): +class IsNextField(Exception): pass class ColumnCollector(object): @@ -148,8 +220,8 @@ class ColumnCollector(object): if not self.data: self.data = dict(data) else: - if self.is_next_record(data): - raise IsNextRecord() + if self.is_next_field(data): + raise IsNextField() for col_id, value in data: self.merge_column(col_id, value) @@ -161,7 +233,7 @@ class ColumnCollector(object): # try adding a wiggle room value? raise UnknownColumn - def is_next_record(self, data): + def is_next_field(self, data): """ If the first key value contains a string and we already have some data in the record, @@ -171,7 +243,6 @@ class ColumnCollector(object): """ first_key = dict(data).keys()[0] if self.data: - print self.data.keys()[0], first_key return self.data.keys()[0] == first_key return False