Module integrators.docuvision_utils
Utility and helper functions for docuvision_integrator.py
Functions
def blank_response(*, task_dict: dict[str, typing.Any], error_message: str, **kwargs) ‑> GetTaskResponse
-
Get an empty template for a response with the minimum fields needed to avoid lookup errors when compiling results. Also include diagnostic info in the final output if required/available.
def check_cascaded_pids(txt_pages: list[str], response: Any)
-
**MODIFIES RESPONSE IN PLACE** — updates the PID value and found status for pages with cascaded PIDs.
Builds a list of names discovered in results for the found PIDs immediately before and after each set of contiguous pages without a found PID. Tests both lists of names against each line of OCR text in each page. The PID whose names produce the best sequence match ratio is selected for the page if the margin between the non-selected best match and selected best match is greater than 0.2. Otherwise, the PID is set to unknown to allow a user to sort out the issue.
def format_dotmap(dotmap: str) ‑> str
-
format dotmap
def format_value(value: str, dotmap: str)
-
Apply standard formatting for specific value types, or return the value if the supplied value or label does not require special handling.
def valid_dos(value: str | None) ‑> bool
-
False if supplied date str is future or past by more than 30 days
def validate_confidence(confidence: float, value: str, dotmap: str) ‑> float
-
Check the validity of the extracted value and set confidence to 0.0 if the value is invalid (e.g. a "date" field whose value is not a valid date should have confidence = 0.0).
Classes
class GetTaskResponse (id: int = 0, name: str = 'Error', error: str | None = None, state: str = 'Error', result: GetTaskResult = <factory>, status: str | None = None, service: str = 'docuvision-1', metadata: dict[str, typing.Any] = <factory>, statistics: dict[str, typing.Any] = <factory>, processedDocument: dict[str, typing.Any] = <factory>, default_dos: str = '2024-08-27', force_default_dos: dataclasses.InitVar[bool] = False)
-
dataclass representation of a docuvision response
Class variables
var default_dos : str
var error : str | None
var force_default_dos : dataclasses.InitVar[bool]
var id : int
var metadata : dict[str, typing.Any]
var name : str
var processedDocument : dict[str, typing.Any]
var result : GetTaskResult
var service : str
var state : str
var statistics : dict[str, typing.Any]
var status : str | None
class GetTaskResult (PIDS: list[str] = <factory>, PAGES: list[dict[str, typing.Any]] = <factory>, RESULT: list[dict[str, typing.Any]] = <factory>, METADATA: dict[str, typing.Any] = <factory>)
-
dataclass representation of the result portion of a docuvision response (the type of GetTaskResponse.result)
Class variables
var METADATA : dict[str, typing.Any]
var PAGES : list[dict[str, typing.Any]]
var PIDS : list[str]
var RESULT : list[dict[str, typing.Any]]
class PidTestCase (before_pid: str, unknown: list[list[str]], after_pid: str, start_offset: int)
-
tuples of (previous, concatenated unknown page text, next)
Ancestors
- builtins.tuple
Instance variables
var after_pid : str
-
Alias for field number 2
var before_pid : str
-
Alias for field number 0
var start_offset : int
-
Alias for field number 3
var unknown : list[list[str]]
-
Alias for field number 1