#!/usr/bin/python
# coding=UTF-8
# Provenance: pyaccuwage/pdfextract.py as added by commit
# fecd14db599c5a8231f9c4b5d7342cc82f6941d3 (Binh Nguyen, 2012-06-19),
# "adding pdfextract for column extraction".
"""Extract column-aligned record tables from a PDF via ``pdftotext``.

The PDF is converted to text by running the external command::

    pdftotext -layout -nopgbrk p1220.pdf -

and the resulting lines are scanned for record headings followed by
table-like rows (rows whose cells are separated by two or more
whitespace characters).
"""

import subprocess
import re
import pdb


class PDFRecordFinder(object):
    """Locate records in the text rendering of a PDF and split their rows.

    A record starts at a line matching ``heading_exp`` and runs until the
    next such line (or the end of the text).  Every table-like line inside
    a record is split into ``(position, text)`` cells.
    """

    # Default heading pattern; used when the caller passes no heading_exp.
    DEFAULT_HEADING_EXP = re.compile(r'\s+Record Name: (.*)')

    # Two or more consecutive whitespace characters separate table columns.
    _MULTIWHITE_EXP = re.compile(r'\s{2,}')

    def __init__(self, src, heading_exp=None):
        """Run ``pdftotext`` on *src* and keep its output as text rows.

        Args:
            src: Path of the PDF file handed to ``pdftotext``.
            heading_exp: Compiled regular expression whose ``match`` marks
                the first line of a record.  Defaults to
                ``DEFAULT_HEADING_EXP``.

        Raises:
            OSError: ``pdftotext`` could not be started.
            subprocess.CalledProcessError: ``pdftotext`` exited non-zero.
            UnicodeDecodeError: the tool's output is not valid UTF-8.
        """
        if not heading_exp:
            heading_exp = self.DEFAULT_HEADING_EXP

        opts = ["pdftotext", "-layout", "-nopgbrk", src, '-']
        pdftext = subprocess.check_output(opts)
        # Decode once at the I/O boundary: check_output returns bytes, and
        # splitting or regex-matching bytes against text patterns raises
        # TypeError on Python 3.
        self.textrows = pdftext.decode('UTF-8').split('\n')
        self.heading_exp = heading_exp

    @staticmethod
    def _to_text(row):
        """Return *row* as text, decoding it from UTF-8 if it is bytes."""
        if isinstance(row, bytes):
            return row.decode('UTF-8')
        return row

    @classmethod
    def _split_columns(cls, row):
        """Split one text row into ``(position, text)`` cells.

        Cells are the stretches of *row* lying between runs of two or more
        whitespace characters; ``position`` is the index in *row* at which
        the cell starts.  Leading indentation and trailing padding produce
        no cell.
        """
        cells = []
        start = 0
        for gap in cls._MULTIWHITE_EXP.finditer(row):
            if gap.start() > start:
                cells.append((start, row[start:gap.start()]))
            start = gap.end()
        if start < len(row):
            # Text after the last gap is the final cell of the row.
            cells.append((start, row[start:]))
        return cells

    @property
    def rows(self):
        """Iterator over the text rows produced by ``pdftotext``."""
        return iter(self.textrows)

    @property
    def records(self):
        """Yield one extracted record per heading line.

        Rows before the first heading are ignored.  The heading line itself
        is not part of the record; the rows after it, up to the next heading
        or the end of the text, are passed to ``extract_record``.
        """
        pending = None  # rows of the record being collected, if any
        for row in self.rows:
            if self.heading_exp.match(self._to_text(row)):
                if pending is not None:
                    yield self.extract_record(iter(pending))
                # Start a fresh group so this record cannot swallow the
                # rows (and headings) of the records that follow it.
                pending = []
            elif pending is not None:
                pending.append(row)
        if pending is not None:
            yield self.extract_record(iter(pending))

    def extract_record(self, row_iter):
        """Split every table-like row of *row_iter* into cells.

        Args:
            row_iter: Iterable of rows (text, or UTF-8 encoded bytes).  It
                is consumed completely.

        Returns:
            A list with one entry per table-like row, each entry being a
            list of ``(position, text)`` tuples.  Empty rows, rows without
            a run of two or more whitespace characters, and rows that
            consist of whitespace only are skipped.
        """
        result = []
        for raw_row in row_iter:
            row = self._to_text(raw_row)
            if not row:
                continue

            # A line without multiple consecutive whitespace is probably
            # full-width prose rather than a table row.
            if not self._MULTIWHITE_EXP.search(row):
                continue

            cells = self._split_columns(row)
            if cells:
                result.append(cells)
        return result