| """Tabular parser. |
| |
| Contains parsers for tabular data files. |
| |
| """ |
| from pathlib import Path |
| from typing import Any, Dict, List, Union |
|
|
| from application.parser.file.base_parser import BaseParser |
|
|
|
|
class CSVParser(BaseParser):
    """CSV parser built on the standard-library ``csv`` module.

    Each CSV row is rendered as its fields joined by ``", "``.

    Args:
        concat_rows (bool): whether to concatenate all rows into one string.
            If set to False, ``parse_file`` returns a list with one string
            per row instead. True by default.

    """

    def __init__(self, *args: Any, concat_rows: bool = True, **kwargs: Any) -> None:
        """Store the row-concatenation flag; other arguments go to the base class."""
        super().__init__(*args, **kwargs)
        self._concat_rows = concat_rows

    def _init_parser(self) -> Dict:
        """Init parser. This parser needs no configuration, so return an empty dict."""
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
        """Parse a CSV file into text.

        Args:
            file (Path): path of the CSV file to read.
            errors (str): decoding error handler passed to ``open``
                (e.g. "ignore", "strict", "replace"). "ignore" by default.

        Returns:
            Union[str, List[str]]: all rows joined by newlines when
            ``concat_rows`` is True, otherwise a list with one string per row.

        """
        # csv is part of the standard library, so no ImportError guard is needed.
        import csv

        # newline="" lets the csv module handle line endings itself, which is
        # required for quoted fields containing embedded newlines. `errors` is
        # forwarded so undecodable bytes follow the caller's chosen policy
        # instead of always raising.
        with open(file, "r", newline="", errors=errors) as fp:
            text_list = [", ".join(row) for row in csv.reader(fp)]

        if self._concat_rows:
            return "\n".join(text_list)
        return text_list
|
|
|
|
class PandasCSVParser(BaseParser):
    r"""Pandas-based CSV parser.

    Parses CSVs using the separator detection from the Pandas `read_csv` function.
    If special parameters are required, use the `pandas_config` dict.

    Args:
        concat_rows (bool): whether to concatenate all rows into one string.
            If set to False, ``parse_file`` returns a list with one string
            per row instead. True by default.

        col_joiner (str): Separator to use for joining cols per row.
            Set to ", " by default.

        row_joiner (str): Separator to use for joining each row.
            Only used when `concat_rows=True`.
            Set to "\n" by default.

        pandas_config (dict): Options for the `pandas.read_csv` function call.
            Refer to https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
            for more information.
            Defaults to None, which is treated as an empty dict; this means
            pandas will try to figure out the separators, table head, etc.
            on its own.

    """

    def __init__(
        self,
        *args: Any,
        concat_rows: bool = True,
        col_joiner: str = ", ",
        row_joiner: str = "\n",
        pandas_config: Union[Dict[str, Any], None] = None,
        **kwargs: Any
    ) -> None:
        """Store the formatting options; other arguments go to the base class."""
        super().__init__(*args, **kwargs)
        self._concat_rows = concat_rows
        self._col_joiner = col_joiner
        self._row_joiner = row_joiner
        # A `{}` default would be one dict object shared by every instance;
        # use a None sentinel and keep a private copy per parser instead.
        self._pandas_config = {} if pandas_config is None else dict(pandas_config)

    def _init_parser(self) -> Dict:
        """Init parser. This parser needs no configuration, so return an empty dict."""
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
        """Parse a CSV file into text using pandas.

        Args:
            file (Path): path of the CSV file to read.
            errors (str): accepted for signature compatibility; it is not
                forwarded to pandas. Configure decoding via `pandas_config`.

        Returns:
            Union[str, List[str]]: all rows joined by `row_joiner` when
            ``concat_rows`` is True, otherwise a list with one string per row.

        Raises:
            ValueError: if pandas is not installed.

        """
        try:
            import pandas as pd
        except ImportError as err:
            raise ValueError("pandas module is required to read CSV files.") from err

        df = pd.read_csv(file, **self._pandas_config)

        # Render each row as its cell values converted to str and joined by
        # the column separator.
        text_list = df.apply(
            lambda row: self._col_joiner.join(row.astype(str).tolist()), axis=1
        ).tolist()

        if self._concat_rows:
            return self._row_joiner.join(text_list)
        return text_list
|
|