Module table_transformer
Usage
from table_transformer import TableTransformer
Function
Restructures data from a directory of json files or a dict of SectionExtractor objects, grouping the data by section and table title. Provides a query-by-key interface for use in data analysis and data summarization. See the TableTransformer class documentation for additional info.
Classes
class Section (section_name: str, section_transforms: sp.SectionTransformSpec)
-
Usage
Internal to table_transformer. See TableTransformer docstring for additional information.
Methods
def add_key(self, table_name: str, key_name: str, check_key_name: str, value_map: dict[str, list[str]])
-
Add a new key (aka column) to a table based on values found in an existing column.
Args
table_name
:str
- table name
key_name
:str
- Name of new key
check_key_name
:str
- Name of existing key to check
value_map
:dict[str, list[str]]
- Each key becomes the new column's value; that value is assigned when the check_key_name column contains any string in the key's value list (see the sketch below)
Returns
str
- message indicating success or failure for logging
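A minimal sketch of a value_map, assuming a Section instance named section; the table, key, and values are hypothetical:
# Hypothetical mapping: the new key receives the dict key whose value list
# contains a string found in the checked column.
value_map = {
    'General': ['general', 'GA'],
    'Regional': ['spinal', 'epidural', 'block'],
}
msg = section.add_key(
    table_name='Anesthesia',       # hypothetical table name
    key_name='anesthesia_type',    # new key/column to add
    check_key_name='Notes',        # existing column whose values are checked
    value_map=value_map,
)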
def add_raw_table(self, source_file: str, table_name: str, rows: list[dict[str, str]])
-
add new Table from source_file to tables[table_name] list
def add_table(self, table: Table)
-
append existing Table to tables[table.title]; used by transform funcs when moving a table
def add_table_name(self, table_name: str)
-
add tables entry for supplied name if not already present
def add_to_section_summary(self, table_name: str, as_dict=False, use_section_name=False, suffix='')
-
Add the named table to the section summary and then drop it.
Args
table_name
:str
- table name
as_dict
:bool
- Optional. If True, add all table contents to a single column in the summary. Defaults to False.
use_section_name
:bool
- Optional. Use the section name as the new column name. Defaults to False.
suffix
:str
- Optional. Appended to column name. Defaults to "".
Returns
str
- message indicating success or failure for logging
def apply_transforms(self, table_names='')
-
See TableTransformer.apply_transforms
def combine_columns(self, table_name: str, columns_to_combine: Sequence[str], separator: str = ' ', new_column_name: str | None = None, strict: bool = False)
-
Combine two or more columns into a single column, removing the original columns. Typically used to combine separate "Date" and "Time" columns into a single DateTime column.
Args
table_name
:str
- table name
columns_to_combine
:Sequence[str]
- Sequence of column names to combine (see the sketch below)
separator
:str
- Optional. Joined between the old column values when concatenating them into the new value. Defaults to " ".
new_column_name
:str
- Optional. If supplied, used as the name for the newly created column. Otherwise, the new column name is calculated as separator.join(columns_to_combine). Defaults to None.
Returns
str
- message indicating success or failure for logging
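A sketch of the typical Date/Time use case, assuming a Section instance named section and hypothetical table/column names:
# Hypothetical row before the transform: {'Date': '2024-01-05', 'Time': '07:30'}
msg = section.combine_columns(
    table_name='Case Times',
    columns_to_combine=['Date', 'Time'],
    separator=' ',
)
# With no new_column_name, the new column is named ' '.join(['Date', 'Time']),
# i.e. 'Date Time', and holds '2024-01-05 07:30'.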
def combine_table_by_source(self, table_name: str)
-
For each source, combine the rows of all table instances into the first instance, then drop all but the first table.
Usage
Ensures there is only one output in pdf_presummary per summary spec query; see 'QCDR' in transform_specs. If two postprocedure notes are included in the pdf, two QCDR tables are added for that source, causing duplicate case note lines / custom field entries.
def condense_tables(self, table_name, condense_spec: sp.CondenseSpec, suffix='', debug=False)
-
Transform function for condensing an overly verbose table into a terse version containing only values of interest.
Args
table_name
:str
- table name
condense_spec
:sp.CondenseSpec
- dictionary containing specifications for converting existing fields into desired output
suffix
:str
- Optional. Appended to all keys in output dict. Defaults to "".
debug
:bool
- Optional. Log debug data. Defaults to False.
Returns
str
- message indicating success or failure for logging
def convert_to_bool(self, table_name: str, convert_key: str, check_key: str, convert_func: Callable[[str, str], bool] = operator.eq)
-
Convert a key's value to a boolean (formatted as a string) by comparing it with the value of another key (see the sketch below).
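A sketch showing the operator.eq default and a custom convert_func, assuming a Section instance named section; the table/column names and the argument order passed to convert_func are assumptions:
import operator

# Default: the converted key becomes 'True' when the two keys' values are equal.
section.convert_to_bool('Meds', convert_key='Given', check_key='Ordered',
                        convert_func=operator.eq)

# Custom callable: substring match instead of strict equality.
section.convert_to_bool('Meds', convert_key='Given', check_key='Ordered',
                        convert_func=lambda given, ordered: ordered in given)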
def drop_keys(self, table_name: str, keys: list[str], sep='.')
-
Drop all keys (aka columns) contained in keys from a table.
Args
table_name
:str
- table name
keys
:list[str]
- List of columns/keys to drop
sep
:str
- Optional. Used for chaining column names. Defaults to ".".
Returns
str
- message indicating success or failure for logging
def drop_rows(self, table_name: str, row_checks: list[Callable[[dict[str, str]], bool]], drop_when: bool = True)
-
drop all rows where row_check(row) == drop_when for the specified table and row_checks.
Args
table_name
:str
- table name
row_checks
:list[Callable[[dict[str, str]], bool]]
- List of row_check functions. If any row_check(row) == drop_when, the row is eliminated (see the sketch below)
drop_when
:bool
- Optional. Defaults to True.
Returns
str
- message indicating success or failure for logging
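A sketch of row_check callables, assuming a Section instance named section and hypothetical table/column names:
def is_blank(row: dict[str, str]) -> bool:
    # True when every value in the row is empty or whitespace
    return all(not v.strip() for v in row.values())

def is_header_echo(row: dict[str, str]) -> bool:
    # True when the row merely repeats the column header
    return row.get('Medication', '') == 'Medication'

# Rows matching either check are dropped (drop_when defaults to True).
msg = section.drop_rows('Medications', [is_blank, is_header_echo])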
def drop_table(self, table_name: str)
-
Drop (delete) the named table from the section.
def split_key(self, table_name: str, check_key: str, new_keys: list[str], split_func: Callable[..., NamedTuple | SimpleNamespace], combine=False, debug=False)
-
split_func should return an object with attributes that match the values in new_keys, typically a namedtuple. E.g. for new_keys = ['street1', 'street2', 'city', 'state', 'zip'], split_func returns an AddressTuple namedtuple with field names corresponding to all values in new_keys (see the sketch below).
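A sketch of a split_func matching the address example, assuming a Section instance named section; the delimiter, table, and column names are illustrative:
from collections import namedtuple

AddressTuple = namedtuple('AddressTuple', ['street1', 'street2', 'city', 'state', 'zip'])

def split_address(value: str) -> AddressTuple:
    # naive illustrative parser expecting "street1;street2;city;state;zip"
    parts = [p.strip() for p in value.split(';')]
    parts += [''] * (5 - len(parts))   # pad missing fields
    return AddressTuple(*parts[:5])

msg = section.split_key(
    'Demographics',
    check_key='Address',
    new_keys=['street1', 'street2', 'city', 'state', 'zip'],
    split_func=split_address,
)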
def standardize_columns(self, table_name: str, default: str = '--', debug=False)
-
standardize the columns in all table instances by source
Args
table_name
:str
- table name
default
:str
- Optional. Value to use when adding a missing key. Defaults to "--".
Returns
str
- message indicating success or failure for logging
def table_for_source(self, tab_name: str, source: str)
-
For manual use. Get the Table named tab_name for the given source.
def tables_for_source(self, source: str)
-
Returns a list of all tables for the supplied source. Used during summary operations.
def update_keys(self, table_name: str, key_tuples: list[tu.KeyTuple])
-
Renames keys based on a list of tuples corresponding to (old key, new key)
Args
table_name
:str
- table name
key_tuples
:list[tuple[str, str]]
- List of string tuples of the form (old key name, new key name)
Returns
str
- message indicating success or failure for logging
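A sketch of key_tuples, assuming a Section instance named section; plain tuples stand in for tu.KeyTuple and the names are hypothetical:
# each tuple is (old key name, new key name)
key_tuples = [
    ('Pt Name', 'patient_name'),
    ('DOB', 'date_of_birth'),
]
msg = section.update_keys('Demographics', key_tuples)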
def value_pivot(self, table_name: str, key_column: str, pivot_columns: Sequence[str], key_column_split: str | None = None, overflow_column: str = 'overflow', sep: str = '.')
-
Classic spreadsheet "pivot" function for transposing table values into columns.
Args
table_name
:str
- table name
key_column
:str
- name of column containing values to convert
key_column_split
:str
- character at which value should be split to produce new column name. Typically set to newline.
overflow_column
:str
- New or existing column to house data occurring after the first instance of key_column_split.
pivot_columns
:Sequence[str]
- Columns which will become new values in pivoted table.
sep
:str
- Optional. Separator for chaining column names. Defaults to ".".
Returns
str
- message indicating success or failure for logging
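An illustrative call, assuming a Section instance named section; the table data, column names, and the exact shape of the pivoted output are assumptions:
# Hypothetical rows before pivoting, one row per measurement:
#   {'Vital': 'BP\npre-op', 'Value': '120/80'}
#   {'Vital': 'HR',         'Value': '72'}
msg = section.value_pivot(
    table_name='Vitals',
    key_column='Vital',
    pivot_columns=['Value'],
    key_column_split='\n',        # text after the newline goes to the overflow column
    overflow_column='overflow',
)
# Values from key_column become column names for the pivoted values,
# e.g. a 'BP' column holding '120/80' and an 'HR' column holding '72'.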
class Table (source_file: str, section_name: str, table_name: str, rows: list[dict[str, str | vStr]])
-
Usage
Internal to table_transformer. See TableTransformer docstring for additional information.
Methods
def cnt(self)
-
return len(self.rows)
def query_by_key(self, key_query: str, drop_keys_like: Sequence[str] | None = None)
-
Given regex string "key_query", return a nested list of values from all matched columns (see the sketch below).
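A sketch of a regex query, assuming a Table instance named table and hypothetical column names:
# return values from every column whose name matches the regex,
# e.g. 'BP.sys' and 'BP.dia' in a vitals table
values = table.query_by_key(r'^BP\.')
# values is a nested list built from the matched columns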
def select_column(self, column: str) ‑> list[str | vStr]
-
returns a list of values for a column
def update_names(self, section_name: str, table_name: str, sep='.')
-
update names in Table object
Args
section_name
:str
- new section name
table_name
:str
- new table name
sep
:str
- Optional. for column name chaining. Defaults to ".".
class TableTransformer (extracted_data, transform_spec: sp.TransformSpec, summary_spec: sp.SummarySpec, auto=True, **kwargs)
-
Usage
tab_transformer = TableTransformer('./mydir/*.json', 'Epic') OR tab_transformer = TableTransformer(section_dict, 'Epic') where section_dict is of the form returned by section_extractor.section_extractor_factory().
Init Args
extracted_data
- dict or directory containing json files
summary_spec
:sp.SummarySpec
- Defines the output of the final call to the summarize_section transform function. Obtain from specs.get_summary_spec().
transform_spec
:sp.TransformSpec
- Defines the transforms to be applied to the data. Obtain from specs.get_transform_spec().
auto
:bool
- If True, perform all section and table transformations defined in the transform_specs and generate a summary df during TableTransformer init. True is the default, as the typical use case is to process and transform the data without additional user intervention. If False, exit the init routine after loading the data but before any transformations are applied; this allows the user to manually apply transforms, examine data at multiple stages of the transformation process, analyze data to inform the creation of new transform specs, and troubleshoot problems.
Function
Read section and table data and group by section and table titles for further processing. E.g. incoming data:
{
  'Patient1': {
    'Section1.Table1': [ {'field1': 'value1', …}, … ],
    'Section1.Table2': [ {'field1': 'value1', …}, … ],
    'Section2.Table1': [ {'field1': 'value1', …}, … ],
  },
  'Patient2': {
    'Section1.Table1': [ {'field1': 'value1', …}, … ],
    'Section1.Table2': [ {'field1': 'value1', …}, … ],
    'Section2.Table1': [ {'field1': 'value1', …}, … ],
  },
  …
}
will be restructured into:
TableTransformer.sections = {
  'Section1': Section(
    .tables = {
      'Table1': [
        Table(.rows = [{'field1': 'value1', …}, …], .source = 'Patient1'),
        Table(.rows = [{'field1': 'value1', …}, …], .source = 'Patient2'),
        …
      ],
      'Table2': [
        Table(.rows = [{'field1': 'value1', …}, …], .source = 'Patient1'),
        Table(.rows = [{'field1': 'value1', …}, …], .source = 'Patient2'),
        …
      ],
      …
    }
  ),
  'Section2': Section(
    .tables = {
      'Table1': [
        Table(.rows = [{'field1': 'value1', …}, …], .source = 'Patient1'),
        Table(.rows = [{'field1': 'value1', …}, …], .source = 'Patient2'),
        …
      ],
      …
    }
  ),
}
where Section and Table are instances of the classes defined below. Note that this restructuring is considered pre-processing and will occur during init regardless of the value passed to the auto param.
Reads in all .json files in the source directory or consumes the output of section_extractor.section_extractor_factory(). A sketch of the manual flow follows.
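A sketch of the manual (auto=False) flow, assuming the spec helpers named in the Init Args live in a module called specs and the json directory is hypothetical:
from table_transformer import TableTransformer
import specs   # assumed home of get_transform_spec / get_summary_spec

transform_spec = specs.get_transform_spec()
summary_spec = specs.get_summary_spec()

# auto=False restructures the incoming data during init but applies no transforms,
# so intermediate state can be inspected before continuing manually.
tt = TableTransformer('./mydir/*.json', transform_spec, summary_spec, auto=False)
tt.apply_transforms()              # apply all section and table transforms
summary = tt.dedup_pdf_summary()   # one possible follow-up: deduped summary dict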
Methods
def add_section(self, section_name: str)
-
Creates a new instance of the Section class and adds it to self.sections dict.
def apply_transforms(self, section_names=None)
-
Usage
Internal if auto=True is passed to TableTransformer constructor OR table_xformer.apply_transforms(Optional list of sections)
Function
Generates an ordered list of function calls according to self.transform_spec and uses getattr to find and call the correct function reference. Three distinct "transform" operations are performed (a sketch of the spec shape follows):
"pre" section transforms: transforms applied at the section level prior to applying transforms at the table level. Specified by transform_specs['Section Name']['section_transforms']['pre']. Transforms defined in transform_specs['default_specs']['section_transforms']['pre'] are also called for ALL sections.
table transforms: transforms applied at the table level by the "apply_transforms" function of the Section class.
"post" section transforms: transforms applied at the section level after applying transforms at the table level. Specified in the same way as the 'pre' transforms but under a 'post' key rather than 'pre', default_specs included.
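A sketch of the transform_specs shape implied by the paths above; only the 'default_specs', 'section_transforms', 'pre', and 'post' keys come from this description, and the section name and handling of table-level entries are assumptions:
transform_specs = {
    'default_specs': {
        'section_transforms': {
            'pre': {},    # section-level transforms run for ALL sections before table transforms
            'post': {},   # section-level transforms run for ALL sections after table transforms
        },
    },
    'Pre Evaluation': {   # hypothetical section name
        'section_transforms': {
            'pre': {},    # run before this section's table transforms
            'post': {},   # run after this section's table transforms
        },
        # table-level transforms (exact key and shape not documented here) are
        # applied by the Section class's apply_transforms function
    },
}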
def as_nested_dict(self, **kwargs) ‑> dict[str, dict[str, typing.Any]]
-
Convert self._deduped_summary into a nested object such that jmespath queries for keys in self._deduped_summary return the original value from self._deduped_summary.
Usage
assign "as_nested_dict" to the "summary_func" key in a summary_spec
KwArgs
dedup_lists
:bool
- An exception to the NOTE below; managed internally.
explode_specs
:list[ExplodeSpec]
- List of ExplodeSpec. Converts nested objects into arrays; each item in the nest becomes an object of the form {"key": "original_item_key", "value": "original_item_value"} (see the sketch below)
NOTE: all kwargs should be passed via entries in the "summary_args" key of a summary_spec
Returns
dict[str, dict[str, Any]]
- nested dictionary of the form {patient_designator: {nested_object}, …}
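A runnable illustration of the explode conversion described for explode_specs, using hypothetical data:
before = {'meds': {'propofol': '200 mg', 'fentanyl': '100 mcg'}}
# each item in the nested object becomes {"key": original_item_key, "value": original_item_value}
after = {'meds': [{'key': k, 'value': v} for k, v in before['meds'].items()]}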
def as_summary_dict(self)
-
used by default_summary_spec to return summary_dict as output
def copy_keys(self, section_name: str, dest_sect_name: str, src_tab_name: str, dest_tab_name: str, key_tuples: list[tu.KeyTuple], **kwargs) ‑> str
-
Usage
Internal.
kwargs: concat, multi_row, create_dest_tab, create_dest_sect (all bool)
Additional Info: transform function applied at the section level via the TableTransformer.apply_transforms() function. Calls are driven via entries in transform_specs['Section Name']['section_transforms'].
Function
copy a key value pair from the first row of a table in one section to all rows of a table in a different section
def dedup_pdf_summary(self, list_sep: str = '|') ‑> dict[str, dict[str, str]]
-
dedup entries in self.summary_dict
def drop_tables(self, section_name, table_names, **kwargs)
-
Usage
Internal.
Additional Info: transform function applied at the section level via the TableTransformer.apply_transforms() function. Calls are driven via entries in transform_specs['Section Name']['section_transforms'].
Function
Drop (delete) all instances of a Table class from a Section.
def flattened_dump(self, sep='|')
-
A flattened dict of all key/value pairs from all tables for each source
def generate_tables_by_source(self) ‑> collections.abc.Iterator[tuple[str, list[Table]]]
-
Iterator for producing tuples of form (source, list of tables)
Yields
tuple[str, list[Table]]
- Each yielded value represents the comprehensive list of tables extracted for source.
def merge_with_output(self, flat_data: dict[str, dict[str, Any]] | None = None, nested_data: dict[str, dict[str, Any]] | None = None) ‑> dict[str, bool]
-
Merge data from the schedule/demo csv (flat_data) with data from the database (nested_data) and data extracted from PDFs (self.output).
Merges utilize the case_matcher class defined in matchops.standard_matchers.
Args
flat_data
:dict[str, dict[str, Any]]
- data from schedule
nested_data
:dict[str, dict[str, Any]]
- data from database
Returns
dict[str, bool]
- dict of case_designator: send_result pairs
def move_tables(self, section_name: str, dest_sect_name: str, src_tab_name: str, dest_tab_name: str, **kwargs) ‑> str
-
Usage
Internal.
Additional Info: transform function applied at the section level via the TableTransformer.apply_transforms() function. Calls are driven via entries in transform_specs['Section Name']['section_transforms'].
Function
Move a Table class from one Section to another (renaming the class if desired), or rename a Table class in place
def section_table_split(self, sect_and_tab: str)
-
Keys are expected to have the form "Section Name.Table Name[.#]", where [.#] is an optional instance id used to avoid duplicate keys. Logs an error if the number of remaining fields != 2 after removing numerics (see the sketch below).
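A runnable sketch of the expected key form; the split shown is illustrative, not the module's implementation:
sect_and_tab = 'Pre Evaluation.Vitals.2'   # hypothetical key with a numeric instance id
fields = [f for f in sect_and_tab.split('.') if not f.isdigit()]
if len(fields) != 2:
    raise ValueError('unexpected key form')   # the method logs an error instead of raising
section_name, table_name = fields             # 'Pre Evaluation', 'Vitals'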
def strip_prefix(self, section_name, table_names, sep='.')
-
Usage
Internal.
Additional Info: transform function applied at the section level via the TableTransformer.apply_transforms() function. Calls are driven via entries in transform_specs['Section Name']['section_transforms'].
Function
Removes prefixes from all keys in a Table class, e.g. "Pre Evaluation.BP" -> "BP".
def summarize_section(self, section_name: str, summary_spec: sp.SummarySpec = None, sep='|', debug=False)
-
Usage
Internal.
Additional Info: transform function applied at the section level via the TableTransformer.apply_transforms() function. Calls are driven via entries in transform_specs['Section Name']['section_transforms'].
Function
Applied after all section and table transforms have been executed to generate self.summary_dict, which contains summary information by patient and is used to generate primary output.
def summary_func_to_output(self)
-
Create output by calling "summary_func" defined in summary_specs
def trans_specs_for_sect(self, section_name: str) ‑> dict[str, dict[str, dict[str, typing.Any]]]
-
Gets section-specific transform specs to be applied to tables within a section. Passed to the Section constructor.