I am self-taught and, inspired by David Beazley's code, I am working on improving my Python skills. I would like to get your feedback on this PDF parser code.
The `lazyproperty` decorator makes an attribute's value be computed only once, on first access, and only if it is actually used (see https://github.com/dabeaz/python-cookbook/blob/master/src/8/lazily_computed_attributes/example1.py).
class lazyproperty:
    """Descriptor that computes an attribute on first access and caches it.

    The wrapped function is called with the instance the first time the
    attribute is read. Its result is then stored on the instance under the
    attribute's own name. Because this class defines only ``__get__`` (no
    ``__set__``), the instance attribute takes precedence on later lookups,
    so the function is not called again for that instance.

    The instance must accept new attributes via ``setattr``.
    """

    def __init__(self, func):
        self.func = func
        # Fallback cache key, used when __set_name__ is never invoked
        # (e.g. the descriptor is attached to the class after creation).
        self.name = func.__name__
        # Expose the wrapped function's docstring on the descriptor.
        self.__doc__ = func.__doc__

    def __set_name__(self, owner, name):
        # Cache under the name the descriptor is actually bound to, so that
        # ``value = lazyproperty(_compute)`` caches as ``value`` rather than
        # overwriting ``_compute`` and recomputing on every access.
        self.name = name

    def __get__(self, instance, cls):
        if instance is None:
            # Accessed on the class itself: return the descriptor.
            return self
        value = self.func(instance)
        setattr(instance, self.name, value)
        return value
class PDFParser():
    """Read the text and images of a single page of a PDF through ``fitz``.

    The document is opened once in ``__init__``. Everything derived from the
    page (text, pixmaps, numpy images) is exposed through ``lazyproperty``
    attributes, so each is computed on first access only.

    Opening is best-effort: if the file or the page cannot be loaded, a
    message is printed and ``_doc`` / ``_page`` stay ``None``.
    """

    def __init__(self, filepath, page_num=0):
        """Open *filepath* and select page *page_num* (index into the document).

        param filepath: path of the PDF file
        param page_num: index of the page to work on
        """
        self.filepath = filepath
        # Assign every attribute before the risky calls so that a failed open
        # leaves a fully-defined object instead of missing attributes.
        self.page_num = page_num
        self._doc = None
        self._page = None
        try:
            self._doc = fitz.open(filepath)
            self._page = self._doc[page_num]
        except Exception as e:
            # Best-effort: report the problem ("cannot read PDF") and carry on.
            print("Lecture PDF impossible. {}".format(e))

    @lazyproperty
    def text(self):
        """Text of the selected page, as returned by the page's ``getText()``."""
        return self._page.getText()

    @lazyproperty
    def _pixs(self):
        """List of ``fitz.Pixmap`` objects, one per image listed for the page."""
        # The first item of each image entry is used as the xref of the image.
        return [
            fitz.Pixmap(self._doc, img[0])
            for img in self._doc.getPageImageList(self.page_num)
        ]

    @lazyproperty
    def _pixpage(self):
        """Pixmap of the whole page rendered with ``colorspace=fitz.csGRAY``."""
        return self._page.getPixmap(colorspace=fitz.csGRAY)

    @property
    def img(self):
        """First image of the page as an array.

        Raises IndexError when the page has no image.
        """
        return self.imgs[0]

    @lazyproperty
    def imgs(self):
        """All images of the page, each converted with :meth:`pix2np`."""
        return [self.pix2np(pix) for pix in self._pixs]

    def write(self, outputdir, fullpage=False):
        """Write PNG files into *outputdir*.

        param outputdir: directory receiving the PNG files
        param fullpage: when true, write the rendered page as
            ``<name>_p<page>.png``; otherwise write every image of the page
            as ``<name>_p<page>_i<index>.png``
        """
        # splitext strips only the final extension, so names containing
        # ".pdf" in the middle or an upper-case ".PDF" are handled correctly.
        filename = os.path.splitext(os.path.basename(self.filepath))[0]
        if fullpage:
            filepath = os.path.join(
                outputdir, '{}_p{}.png'.format(filename, self.page_num))
            self._write_png(self._pixpage, filepath)
            return
        for i, pix in enumerate(self._pixs):
            filepath = os.path.join(
                outputdir, '{}_p{}_i{}.png'.format(filename, self.page_num, i))
            self._write_png(pix, filepath)

    @staticmethod
    def _write_png(pix, filepath):
        """Save *pix* to *filepath*, retrying through an RGB conversion."""
        try:
            # This is GRAY or RGB
            pix.writePNG(filepath)
        except Exception:
            # CMYK: convert to RGB first.
            # Exception (not a bare except) keeps KeyboardInterrupt and
            # SystemExit from being swallowed by the retry.
            fitz.Pixmap(fitz.csRGB, pix).writePNG(filepath)

    @staticmethod
    def pix2np(pix):
        """
        Convert pixmap to image np.ndarray
        https://stackoverflow.com/questions/53059007/python-opencv
        param pix: pixmap exposing ``samples``, ``h``, ``w`` and ``n``
        return: uint8 array of shape (h, w, 3) with channels reversed
            (RGB input becomes BGR)
        """
        import numpy as np
        # https://stackoverflow.com/questions/22236749/numpy-what-is-the-difference-between-frombuffer-and-fromstring
        im = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
        if pix.n >= 3:
            # Keep the first three channels in reverse order (rgb to bgr).
            # NOTE(review): a fourth channel is dropped; this presumably
            # expects RGB/RGBA input, not CMYK - confirm against callers.
            return np.ascontiguousarray(im[..., [2, 1, 0]])
        # Fewer than three channels: treat the data as gray and expand it.
        logger.warning("Shape of image array is {}".format(im.shape))
        # All three output channels are equal, so no reordering is needed.
        return cv2.cvtColor(im, cv2.COLOR_GRAY2BGR)