diff --git a/pyaccuwage/pdfextract.py b/pyaccuwage/pdfextract.py index eb2a9f0..70e76b5 100644 --- a/pyaccuwage/pdfextract.py +++ b/pyaccuwage/pdfextract.py @@ -17,36 +17,23 @@ class PDFRecordFinder(object): pdftext = subprocess.check_output(opts) self.textrows = pdftext.split('\n') self.heading_exp = heading_exp - - """ - @property - def records(self): - row_iter = iter(self.textrows) - try: - while 1: - row = row_iter.next() - if self.heading_exp.match(row): - record = self.extract_record(row_iter) - yield record - except Exception, e: - raise e - #for r in self.textrows: - # if self.heading_exp.match(r): - # record = self.extract_record() - # yield record - """ - + """ def columns(self): results = [] cc = ColumnCollector() - for group in self.record_grouping(): + for heading, group in self.record_grouping(): + print "HEADING", heading for row in group: - if cc.is_next_record(row): + if cc.is_next_field(row): yield cc cc = ColumnCollector() - print row - cc.add(row) + #print row + try: + cc.add(row) + except UnknownColumn, e: + results.append(cc) + cc = ColumnCollector() @@ -55,31 +42,114 @@ class PDFRecordFinder(object): i = 0 for row in row_iter: i += 1 - if self.heading_exp.match(row): - yield self.extract_record_columns(row_iter) + match = self.heading_exp.match(row) + if match: + yield (match.groups(), self.extract_record_columns(row_iter)) + + """ - def extract_record_columns(self, row_iter): - re_multiwhite = re.compile(r'\s{2,}') - result = [] - full_width_text_count = 0 + def locate_heading_rows(self): + results = [] + for (i, row) in enumerate(self.textrows): + match = self.heading_exp.match(row) + if match: + if not ''.join(match.groups()).lower().endswith('(continued)'): + results.append((i, ''.join(match.groups()))) + return results + + def records2(self): + row_iter = iter(self.textrows) + record = True + while record: + record = self.extract_record(row_iter) + yield record + + + def extract_record(self, row_iter): + heading = self.find_heading(row_iter) + fields = self.find_fields(row_iter) + return heading, list(fields) + + + def find_heading(self, row_iter): + for row in row_iter: + heading_match = self.heading_exp.match(row) + if heading_match: + return heading_match.groups() + + + def find_fields(self, row_iter): + cc = ColumnCollector() for r in row_iter: - row = r.decode('UTF-8') + row = self.extract_columns_from_row(r) if not row: continue + + if cc.is_next_field(row): + yield cc + cc = ColumnCollector() + try: + cc.add(row) + except UnknownColumn, e: + print 'UNKNOWN COLUMN', row + raise StopIteration - #if row.strip().startswith('Code'): - # pdb.set_trace() + def extract_columns_from_row(self, row): + re_multiwhite = re.compile(r'\s{2,}') + + # IF LINE DOESN'T CONTAIN MULTIPLE WHITESPACES, IT'S LIKELY NOT A TABLE + if not re_multiwhite.search(row): + return None + + white_ranges = [0,] + pos = 0 + while pos < len(row): + match = re_multiwhite.search(row[pos:]) + if match: + white_ranges.append(pos + match.start()) + white_ranges.append(pos + match.end()) + pos += match.end() + else: + white_ranges.append(len(row)) + pos = len(row) + + row_result = [] + white_iter = iter(white_ranges) + while white_iter: + try: + start = white_iter.next() + end = white_iter.next() + if start != end: + row_result.append( + (start, row[start:end]) + ) + + except StopIteration: + white_iter = None + + return row_result + + + + def extract_row_columns(self, row_iter): + re_multiwhite = re.compile(r'\s{2,}') + full_width_text_count = 0 + + #for r in row_iter: + row = None + while not row: + row = row_iter.next() + row = row.decode('UTF-8') + # IF LINE CONTAINS MULTIPLE WHITESPACE, IT IS PROBABLY A TABLE if not re_multiwhite.search(row): full_width_text_count += 1 - #if full_width_text_count > 2: - # print 'full width text count exceeded limit' - # return result - continue - + row = None + + if True: white_ranges = [0,] pos = 0 match = True @@ -107,9 +177,11 @@ class PDFRecordFinder(object): except StopIteration: white_iter = None - print white_ranges - yield row_result - result.append(row_result) + + #print white_ranges + return row_result + #yield row_result + #result.append(row_result) """ @@ -136,7 +208,7 @@ class PDFRecordFinder(object): class UnknownColumn(Exception): pass -class IsNextRecord(Exception): +class IsNextField(Exception): pass class ColumnCollector(object): @@ -148,8 +220,8 @@ class ColumnCollector(object): if not self.data: self.data = dict(data) else: - if self.is_next_record(data): - raise IsNextRecord() + if self.is_next_field(data): + raise IsNextField() for col_id, value in data: self.merge_column(col_id, value) @@ -161,7 +233,7 @@ class ColumnCollector(object): # try adding a wiggle room value? raise UnknownColumn - def is_next_record(self, data): + def is_next_field(self, data): """ If the first key value contains a string and we already have some data in the record, @@ -171,7 +243,6 @@ class ColumnCollector(object): """ first_key = dict(data).keys()[0] if self.data: - print self.data.keys()[0], first_key return self.data.keys()[0] == first_key return False