| """Docs parser. |
| |
| Contains parsers for docx, pdf files. |
| |
| """ |
| from pathlib import Path |
| from typing import Dict |
|
|
| from application.parser.file.base_parser import BaseParser |
|
|
|
|
class PDFParser(BaseParser):
    """Parser that extracts plain text from PDF files via PyPDF2."""

    def _init_parser(self) -> Dict:
        """Return the parser configuration.

        Returns:
            An empty dict; this parser keeps no configuration.
        """
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> str:
        """Extract the text of every page of a PDF file.

        Args:
            file: Path of the PDF file to read.
            errors: Not referenced by this method (presumably kept so the
                signature matches the other parsers -- confirm against
                ``BaseParser``).

        Returns:
            The text extracted from each page, in page order, joined with
            newline characters.

        Raises:
            ValueError: If PyPDF2 cannot be imported.
        """
        # Imported inside the method so PyPDF2 is only needed when a PDF is
        # actually parsed.
        try:
            import PyPDF2
        except ImportError as err:
            # Keep the ValueError callers may already catch, but chain the
            # original ImportError so the real cause shows in the traceback.
            raise ValueError("PyPDF2 is required to read PDF files.") from err

        with open(file, "rb") as fp:
            pdf = PyPDF2.PdfReader(fp)
            # Extraction stays inside the ``with`` block (as in the original)
            # so it runs while the underlying file handle is still open.
            page_texts = [page.extract_text() for page in pdf.pages]

        return "\n".join(page_texts)
|
|
|
|
class DocxParser(BaseParser):
    """Parser that extracts plain text from Microsoft Word files via docx2txt."""

    def _init_parser(self) -> Dict:
        """Return the parser configuration.

        Returns:
            An empty dict; this parser keeps no configuration.
        """
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> str:
        """Extract the text of a Microsoft Word document.

        Args:
            file: Path of the document to read.
            errors: Not referenced by this method (presumably kept so the
                signature matches the other parsers -- confirm against
                ``BaseParser``).

        Returns:
            Whatever ``docx2txt.process`` returns for ``file``.

        Raises:
            ValueError: If docx2txt cannot be imported.
        """
        # Imported inside the method so docx2txt is only needed when a Word
        # document is actually parsed.
        try:
            import docx2txt
        except ImportError as err:
            # Keep the ValueError callers may already catch, but chain the
            # original ImportError so the real cause shows in the traceback.
            raise ValueError(
                "docx2txt is required to read Microsoft Word files."
            ) from err

        return docx2txt.process(file)
|
|