From fecd14db599c5a8231f9c4b5d7342cc82f6941d3 Mon Sep 17 00:00:00 2001
From: Binh Nguyen <binh37@gmail.com>
Date: Tue, 19 Jun 2012 15:37:17 -0500
Subject: [PATCH] adding pdfextract for column extraction

---
 pyaccuwage/pdfextract.py | 69 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 pyaccuwage/pdfextract.py

diff --git a/pyaccuwage/pdfextract.py b/pyaccuwage/pdfextract.py
new file mode 100644
index 0000000..365d322
--- /dev/null
+++ b/pyaccuwage/pdfextract.py
@@ -0,0 +1,69 @@
+#!/usr/bin/python
+# coding=UTF-8
+
+import subprocess
+import re
+import pdb
+
+""" pdftotext -layout -nopgbrk p1220.pdf - """
+
+class PDFRecordFinder(object):
+    """Find column-aligned records in ``pdftotext -layout`` output of a PDF.
+
+    Rows after each ``heading_exp`` match form one record (a list of rows).
+    """
+
+    def __init__(self, src, heading_exp=None):
+        """Convert the PDF at ``src`` to text rows by running ``pdftotext``.
+
+        ``heading_exp`` is a compiled regex whose ``match`` marks heading
+        rows; by default it matches indented ``Record Name: ...`` lines.
+        """
+        if not heading_exp:
+            heading_exp = re.compile(r'\s+Record Name: (.*)')
+
+        opts = ["pdftotext", "-layout", "-nopgbrk", src, '-']
+        # Decode once at the I/O boundary so rows are text on Python 2 and 3.
+        pdftext = subprocess.check_output(opts).decode('UTF-8')
+        self.textrows = pdftext.split('\n')
+        self.heading_exp = heading_exp
+
+    @property
+    def rows(self):
+        """Yield every text row of the converted document, in order."""
+        for row in self.textrows:
+            yield row
+
+    @property
+    def records(self):
+        """Yield one parsed record (see ``extract_record``) per heading row."""
+        pending = None  # rows seen since the last heading; None before any
+        for r in self.rows:
+            if self.heading_exp.match(r):
+                # A new heading closes the record opened by the previous one.
+                if pending is not None:
+                    yield self.extract_record(iter(pending))
+                pending = []
+            elif pending is not None:
+                pending.append(r)
+        if pending is not None:
+            yield self.extract_record(iter(pending))
+
+    def extract_record(self, row_iter):
+        """Split the table-like rows of ``row_iter`` into columns.
+
+        Empty rows and rows without a run of 2+ whitespace characters (plain
+        full-width text) are skipped.  Returns one list per kept row, each
+        holding ``(offset, text)`` tuples for its columns, left to right.
+        """
+        re_multiwhite = re.compile(r'\s{2,}')
+        re_column = re.compile(r'\S+(?:\s\S+)*')  # words split by 1 space
+        result = []
+        for r in row_iter:
+            row = r.decode('UTF-8') if isinstance(r, bytes) else r
+            if not row or not re_multiwhite.search(row):
+                continue
+            columns = [(m.start(), m.group()) for m in re_column.finditer(row)]
+            if columns:
+                result.append(columns)
+        return result