| | import os |
| | from io import BytesIO |
| | from tqdm import tqdm |
| | import numpy as np |
| | from typing import Callable, Dict, List |
| | from PIL import Image as PIL_Image |
| | from PIL.Image import Image |
| |
|
| | from datasets import logging |
| |
|
| | logger = logging.get_logger(__name__) |
| | import PyPDF2 |
| |
|
# Upper bound on how many images are collected for a single document.
MAX_PAGES = 50
# Inputs whose len(example["file"]) exceeds this value are skipped
# (presumably a size in bytes -- confirm against the dataset schema).
MAX_PDF_SIZE = 100_000_000
# Thresholds compared against an image's width/height when filtering images.
MIN_WIDTH = 150
MIN_HEIGHT = 150
| | import pdf2image |
| |
|
| |
|
def pdf2image_image_extraction(pdf_stream):
    """Render a PDF byte stream into images via ``pdf2image``.

    Args:
        pdf_stream: Raw PDF content, handed unchanged to
            ``pdf2image.convert_from_bytes``.

    Returns:
        Whatever ``pdf2image.convert_from_bytes`` returns on success, or
        ``None`` when the conversion raises; the exception text is logged
        as a warning in that case.
    """
    try:
        rendered: List[Image] = pdf2image.convert_from_bytes(pdf_stream)
    except Exception as e:
        # Best effort: callers treat a falsy result as "no images".
        logger.warning(f"{e}")
        return None
    return rendered
| |
|
| |
|
def _collect_pixel_values(images, feature_extractor, inference_method):
    """Filter, cap and (unless building a grid) featurize an iterable of images.

    Args:
        images: Iterable of objects exposing ``width``, ``height`` and
            ``convert("RGB")`` (PIL images in this module).
        feature_extractor: Callable taking a list of RGB images and returning
            a mapping whose ``"pixel_values"`` entry is indexable.
        inference_method: Object exposing a ``scope`` string.

    Returns:
        A list with at most ``MAX_PAGES`` entries: raw images when the scope
        is ``"sample-grid"``, otherwise the extracted pixel values.
    """
    keep_raw = inference_method.scope == "sample-grid"
    collected = []
    for im in images:
        # NOTE(review): an image is skipped only when BOTH dimensions are
        # below the threshold, whereas nativepdf_to_pixelvalues_extractor
        # rejects an image when EITHER is -- confirm which is intended.
        if im.width < MIN_WIDTH and im.height < MIN_HEIGHT:
            continue
        if not keep_raw:
            im = feature_extractor([im.convert("RGB")])["pixel_values"][0]
        collected.append(im)
        # Check the cap after appending so that a lazy iterable is never
        # asked for (and never has to decode) an image beyond the limit.
        if len(collected) >= MAX_PAGES:
            break
    return collected


def pdf_to_pixelvalues_extractor(example, feature_extractor, inference_method):
    """Fill ``example["pixel_values"]`` from the PDF stored in ``example["file"]``.

    Images embedded in the PDF are read with PyPDF2 first. When that yields
    nothing (or raises), the document is rendered with
    ``pdf2image_image_extraction`` instead and the rendered images are used.

    Args:
        example: Mapping with a ``"file"`` entry holding the PDF content and
            an optional ``"id"`` used in diagnostics. It is mutated in place.
        feature_extractor: Callable taking a list of RGB images and returning
            a mapping whose ``"pixel_values"`` entry is indexable.
        inference_method: Object exposing ``scope`` (a string) and
            ``get_page_scope``; scopes containing ``"sample"`` select a
            single page, ``"sample-grid"`` builds one grid image.

    Returns:
        The same ``example``. On failure ``"pages"`` is 0 and
        ``"pixel_values"`` is ``None``, with two exceptions that keep the
        PDF's page count in ``"pages"``: an unreadable PDF is impossible to
        count (so it is 0), but an oversized file returns before counting
        (0) -- both leave ``"pixel_values"`` as ``None``. On success
        ``"pixel_values"`` is a numpy array.
    """
    example["pages"] = 0
    example["pixel_values"] = None
    if len(example["file"]) > MAX_PDF_SIZE:
        logger.warning(f"too large file {len(example['file'])}")
        return example
    try:
        reader = PyPDF2.PdfReader(BytesIO(example["file"]))
    except Exception as e:
        logger.warning(f"read_pdf {e}")
        return example
    example["pages"] = len(reader.pages)

    scope = inference_method.scope
    if "sample" in scope and scope != "sample-grid":
        # Single-page sampling: only the page chosen by the method is read.
        page_iterator = [inference_method.get_page_scope(reader.pages)]
    else:
        page_iterator = reader.pages

    try:
        # Lazily decode embedded images so nothing past the cap is opened.
        embedded = (
            PIL_Image.open(BytesIO(image.data))
            for page in page_iterator
            for image in page.images
        )
        pixel_values = _collect_pixel_values(embedded, feature_extractor, inference_method)
    except Exception as e:
        print(f"{example.get('id')} PyPDF get_images {e}")
        # Discard partial results and let the rendering fallback take over.
        pixel_values = []

    if not pixel_values:
        # Fallback: render the document instead of reading embedded images.
        try:
            images = pdf2image_image_extraction(example["file"])
        except Exception as e:
            print(f"{example.get('id')} pdf2image get_images {e}")
            images = []

        if not images:
            print(f"{example.get('id')} pdf2image has no images")
            example["pages"] = 0
            return example

        example["pages"] = len(images)
        pixel_values = _collect_pixel_values(images, feature_extractor, inference_method)

    if not pixel_values:
        # Every candidate image was filtered out. Bail out the same way
        # nativepdf_to_pixelvalues_extractor does instead of failing on
        # pixel_values[0] or passing an empty list to the grid builder.
        print(f"{example.get('id')} has no valid images")
        example["pages"] = 0
        return example

    if scope == "sample-grid":
        grid = inference_method.get_page_scope(pixel_values)
        pixel_values = feature_extractor([grid.convert("RGB")])["pixel_values"][0]
    elif "sample" in scope:
        pixel_values = pixel_values[0]
    example["pixel_values"] = np.array(pixel_values)
    return example
| |
|
| |
|
def nativepdf_to_pixelvalues_extractor(example, feature_extractor, inference_method):
    """Fill ``example["pixel_values"]`` from images rendered out of the PDF.

    The document in ``example["file"]`` is rendered with
    ``pdf2image_image_extraction``; images narrower than ``MIN_WIDTH`` or
    shorter than ``MIN_HEIGHT`` are dropped, and at most ``MAX_PAGES``
    images are kept.

    Args:
        example: Mapping with a ``"file"`` entry holding the PDF content and
            an optional ``"id"`` used in diagnostics. It is mutated in place.
        feature_extractor: Callable taking a list of RGB images and returning
            a mapping whose ``"pixel_values"`` entry is indexable.
        inference_method: Object exposing ``scope`` (a string) and
            ``get_page_scope``.

    Returns:
        The same ``example``. On failure ``"pages"`` is 0 and
        ``"pixel_values"`` is ``None``; on success ``"pages"`` is the number
        of images that passed the size filter and ``"pixel_values"`` is a
        numpy array.
    """
    # Document ids that are rejected outright when a grid is requested.
    unsupported_for_grid = (
        "6483941-Letter-to-John-Campbell.pdf",
        "7276809-Ocoee-Newspaper-Pages.pdf",
    )

    def give_up(reason):
        # Shared failure exit: report, zero the page count, hand back example.
        print(f"{example.get('id')} {reason}")
        example["pages"] = 0
        return example

    example["pages"] = 0
    example["pixel_values"] = None
    if len(example["file"]) > MAX_PDF_SIZE:
        logger.warning(f"too large file {len(example['file'])}")
        return example

    try:
        rendered = pdf2image_image_extraction(example["file"])
    except Exception as e:
        print(f"{example.get('id')} pdf2image get_images {e}")
        rendered = []
    if not rendered:
        return give_up("pdf2image has no images")

    usable = [
        page
        for page in rendered
        if page.width >= MIN_WIDTH and page.height >= MIN_HEIGHT
    ]
    # Evaluated left to right so the scope is only consulted for listed ids.
    grid_blocked = (
        bool(usable)
        and example.get("id") in unsupported_for_grid
        and inference_method.scope == "sample-grid"
    )
    if not usable or grid_blocked:
        return give_up("pdf2image has no images")

    example["pages"] = len(usable)
    if "sample" in inference_method.scope and inference_method.scope != "sample-grid":
        candidates = [inference_method.get_page_scope(usable)]
    else:
        candidates = usable

    # Both branches above produce a list, so the cap is a plain slice.
    capped = candidates[:MAX_PAGES]
    if inference_method.scope != "sample-grid":
        pixel_values = [
            feature_extractor([page.convert("RGB")])["pixel_values"][0]
            for page in capped
        ]
    else:
        # Grid mode keeps the raw images; they are featurized as one grid below.
        pixel_values = list(capped)

    if not pixel_values:
        return give_up("pdf2image has no valid images")

    if inference_method.scope == "sample-grid":
        grid = inference_method.get_page_scope(pixel_values)
        pixel_values = feature_extractor([grid.convert("RGB")])["pixel_values"][0]
    elif "sample" in inference_method.scope:
        pixel_values = pixel_values[0]
    example["pixel_values"] = np.array(pixel_values)
    return example
| |
|