aces.predicates module¶
This module contains functions for generating predicate columns for event sequences.
- aces.predicates.direct_load_plain_predicates(data_path: Path, predicates: list[str], ts_format: str | None) DataFrame[source]¶
Loads a CSV or Parquet file from disk and verifies that the necessary plain predicate columns are present.
- This file must have the following columns:
subject_id: The subject identifier.
timestamp: The timestamp of the event, either as a string in the format given by ts_format (e.g., “MM/DD/YYYY HH:MM”) or as an already-parsed datetime column.
Any additional columns specified in the set of desired plain predicates.
Example
>>> CSV_data = pl.DataFrame({ ... "subject_id": [1, 1, 1, 1, 2, 2], ... "timestamp": [None, "01/01/2021 00:00", None, "01/01/2021 12:00", "01/02/2021 00:00", None], ... "is_admission": [0, 1, 0, 0, 1, 0], ... "is_discharge": [0, 0, 0, 1, 0, 0], ... "is_male": [1, 0, 0, 0, 0, 0], ... "is_female": [0, 0, 0, 0, 0, 1], ... "brown_eyes": [0, 0, 1, 0, 0, 0], ... }) >>> with tempfile.NamedTemporaryFile(mode="w", suffix=".parquet") as f: ... data_path = Path(f.name) ... CSV_data.write_parquet(data_path) ... direct_load_plain_predicates(data_path, ["is_admission", "is_discharge", "is_male", ... "is_female", "brown_eyes"], "%m/%d/%Y %H:%M") shape: (5, 7) ┌────────────┬─────────────────────┬──────────────┬──────────────┬─────────┬───────────┬────────────┐ │ subject_id ┆ timestamp ┆ is_admission ┆ is_discharge ┆ is_male ┆ is_female ┆ brown_eyes │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ ╞════════════╪═════════════════════╪══════════════╪══════════════╪═════════╪═══════════╪════════════╡ │ 1 ┆ null ┆ 0 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ │ 1 ┆ 2021-01-01 00:00:00 ┆ 1 ┆ 0 ┆ 0 ┆ 0 ┆ 0 │ │ 1 ┆ 2021-01-01 12:00:00 ┆ 0 ┆ 1 ┆ 0 ┆ 0 ┆ 0 │ │ 2 ┆ 2021-01-02 00:00:00 ┆ 1 ┆ 0 ┆ 0 ┆ 0 ┆ 0 │ │ 2 ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ 1 ┆ 0 │ └────────────┴─────────────────────┴──────────────┴──────────────┴─────────┴───────────┴────────────┘If the timestamp column is already a timestamp, then the
ts_format argument is not needed, but can be used without an error. >>> with tempfile.NamedTemporaryFile(mode="w", suffix=".parquet") as f: ... data_path = Path(f.name) ... ( ... CSV_data ... .with_columns(pl.col("timestamp").str.strptime(pl.Datetime, format="%m/%d/%Y %H:%M")) ... .write_parquet(data_path) ... ) ... direct_load_plain_predicates(data_path, ["is_admission", "is_discharge", "is_male", ... "is_female", "brown_eyes"], "%m/%d/%Y %H:%M") shape: (5, 7) ┌────────────┬─────────────────────┬──────────────┬──────────────┬─────────┬───────────┬────────────┐ │ subject_id ┆ timestamp ┆ is_admission ┆ is_discharge ┆ is_male ┆ is_female ┆ brown_eyes │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ ╞════════════╪═════════════════════╪══════════════╪══════════════╪═════════╪═══════════╪════════════╡ │ 1 ┆ null ┆ 0 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ │ 1 ┆ 2021-01-01 00:00:00 ┆ 1 ┆ 0 ┆ 0 ┆ 0 ┆ 0 │ │ 1 ┆ 2021-01-01 12:00:00 ┆ 0 ┆ 1 ┆ 0 ┆ 0 ┆ 0 │ │ 2 ┆ 2021-01-02 00:00:00 ┆ 1 ┆ 0 ┆ 0 ┆ 0 ┆ 0 │ │ 2 ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ 1 ┆ 0 │ └────────────┴─────────────────────┴──────────────┴──────────────┴─────────┴───────────┴────────────┘ >>> with tempfile.NamedTemporaryFile(mode="w", suffix=".parquet") as f: ... data_path = Path(f.name) ... ( ... CSV_data ... .with_columns(pl.col("timestamp").str.strptime(pl.Datetime, format="%m/%d/%Y %H:%M")) ... .write_parquet(data_path) ... ) ... direct_load_plain_predicates(data_path, ["is_admission", "is_discharge", "is_male", ... 
"is_female", "brown_eyes"], None) shape: (5, 7) ┌────────────┬─────────────────────┬──────────────┬──────────────┬─────────┬───────────┬────────────┐ │ subject_id ┆ timestamp ┆ is_admission ┆ is_discharge ┆ is_male ┆ is_female ┆ brown_eyes │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ ╞════════════╪═════════════════════╪══════════════╪══════════════╪═════════╪═══════════╪════════════╡ │ 1 ┆ null ┆ 0 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ │ 1 ┆ 2021-01-01 00:00:00 ┆ 1 ┆ 0 ┆ 0 ┆ 0 ┆ 0 │ │ 1 ┆ 2021-01-01 12:00:00 ┆ 0 ┆ 1 ┆ 0 ┆ 0 ┆ 0 │ │ 2 ┆ 2021-01-02 00:00:00 ┆ 1 ┆ 0 ┆ 0 ┆ 0 ┆ 0 │ │ 2 ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ 1 ┆ 0 │ └────────────┴─────────────────────┴──────────────┴──────────────┴─────────┴───────────┴────────────┘ >>> with tempfile.NamedTemporaryFile(mode="w", suffix=".csv") as f: ... data_path = Path(f.name) ... CSV_data.write_csv(data_path) ... direct_load_plain_predicates(data_path, ["is_admission", "is_discharge", "is_male", ... "is_female", "brown_eyes"], "%m/%d/%Y %H:%M") shape: (5, 7) ┌────────────┬─────────────────────┬──────────────┬──────────────┬─────────┬───────────┬────────────┐ │ subject_id ┆ timestamp ┆ is_admission ┆ is_discharge ┆ is_male ┆ is_female ┆ brown_eyes │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ ╞════════════╪═════════════════════╪══════════════╪══════════════╪═════════╪═══════════╪════════════╡ │ 1 ┆ null ┆ 0 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ │ 1 ┆ 2021-01-01 00:00:00 ┆ 1 ┆ 0 ┆ 0 ┆ 0 ┆ 0 │ │ 1 ┆ 2021-01-01 12:00:00 ┆ 0 ┆ 1 ┆ 0 ┆ 0 ┆ 0 │ │ 2 ┆ 2021-01-02 00:00:00 ┆ 1 ┆ 0 ┆ 0 ┆ 0 ┆ 0 │ │ 2 ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ 1 ┆ 0 │ └────────────┴─────────────────────┴──────────────┴──────────────┴─────────┴───────────┴────────────┘ >>> with tempfile.NamedTemporaryFile(mode="w", suffix=".csv") as f: ... data_path = Path(f.name) ... CSV_data.write_csv(data_path) ... 
direct_load_plain_predicates(data_path, ["is_discharge", "brown_eyes"], "%m/%d/%Y %H:%M") shape: (5, 4) ┌────────────┬─────────────────────┬──────────────┬────────────┐ │ subject_id ┆ timestamp ┆ is_discharge ┆ brown_eyes │ │ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ datetime[μs] ┆ i64 ┆ i64 │ ╞════════════╪═════════════════════╪══════════════╪════════════╡ │ 1 ┆ null ┆ 0 ┆ 1 │ │ 1 ┆ 2021-01-01 00:00:00 ┆ 0 ┆ 0 │ │ 1 ┆ 2021-01-01 12:00:00 ┆ 1 ┆ 0 │ │ 2 ┆ 2021-01-02 00:00:00 ┆ 0 ┆ 0 │ │ 2 ┆ null ┆ 0 ┆ 0 │ └────────────┴─────────────────────┴──────────────┴────────────┘ >>> with tempfile.NamedTemporaryFile(mode="w", suffix=".csv") as f: ... data_path = Path(f.name) ... CSV_data.write_csv(data_path) ... direct_load_plain_predicates(data_path, ["is_foobar"], "%m/%d/%Y %H:%M") Traceback (most recent call last): ... polars.exceptions.ColumnNotFoundError: ['is_foobar'] >>> with tempfile.NamedTemporaryFile(mode="w", suffix=".foo") as f: ... data_path = Path(f.name) ... CSV_data.write_csv(data_path) ... direct_load_plain_predicates(data_path, ["is_discharge"], "%m/%d/%Y %H:%M") Traceback (most recent call last): ... ValueError: Unsupported file format: .foo >>> with tempfile.TemporaryDirectory() as d: ... data_path = Path(d) / "data.csv" ... assert not data_path.exists() ... direct_load_plain_predicates(data_path, ["is_admission", "is_discharge"], "%m/%d/%Y %H:%M") Traceback (most recent call last): ... FileNotFoundError: Direct predicates file ... does not exist! >>> with tempfile.NamedTemporaryFile(mode="w", suffix=".parquet") as f: ... data_path = Path(f.name) ... CSV_data.write_parquet(data_path) ... direct_load_plain_predicates(data_path, ["is_admission", "is_discharge"], None) Traceback (most recent call last): ... ValueError: Must provide a timestamp format for direct predicates with str timestamps. >>> with tempfile.NamedTemporaryFile(mode="w", suffix=".parquet") as f: ... data_path = Path(f.name) ... ( ... CSV_data ... .with_columns( ... 
pl.col("timestamp").str.strptime(pl.Datetime, format="%m/%d/%Y %H:%M") ... .dt.timestamp() ... ) ... .write_parquet(data_path) ... ) ... direct_load_plain_predicates(data_path, ["is_admission", "is_discharge"], None) Traceback (most recent call last): ... TypeError: Passed predicates have timestamps of invalid type Int64.
- aces.predicates.generate_plain_predicates_from_esgpt(data_path: Path, predicates: dict) DataFrame[source]¶
Generate plain predicate columns from an ESGPT dataset.
To learn more about the ESGPT format, please visit https://eventstreamml.readthedocs.io/en/latest/
- Parameters:¶
- Returns:¶
The Polars DataFrame containing the extracted predicates per subject per timestamp across the entire ESGPT dataset.
>>> generate_plain_predicates_from_esgpt(Path("/fake/path"), {}) Traceback (most recent call last): ... ImportError: The 'EventStream' package is required to load ESGPT datasets. If you mean to use a MEDS dataset, please specify the 'MEDS' standard. Otherwise, please install the package from https://github.com/mmcdermott/EventStreamGPT and add the package to your PYTHONPATH.
- aces.predicates.generate_plain_predicates_from_meds(data_path: Path, predicates: dict[str, aces.config.PlainPredicateConfig]) DataFrame[source]¶
Generate plain predicate columns from a MEDS dataset.
To learn more about the MEDS format, please visit https://github.com/Medical-Event-Data-Standard/meds
- Parameters:¶
- Returns:¶
The Polars DataFrame containing the extracted predicates per subject per timestamp across the entire MEDS dataset.
Example
>>> parquet_data = pl.DataFrame({ ... "subject_id": [1, 1, 1, 2, 3], ... "time": ["1/1/1989 00:00", "1/1/1989 01:00", "1/1/1989 01:00", "1/1/1989 02:00", None], ... "code": ['admission', 'discharge', 'discharge', 'admission', "gender//male"], ... }).with_columns(pl.col("time").str.strptime(pl.Datetime, format="%m/%d/%Y %H:%M")) >>> with tempfile.NamedTemporaryFile(mode="w", suffix=".parquet") as f: ... data_path = Path(f.name) ... parquet_data.write_parquet(data_path) ... generate_plain_predicates_from_meds( ... data_path, ... {"discharge": PlainPredicateConfig("discharge"), ... "male": PlainPredicateConfig("gender//male", static=True)} ... ) shape: (4, 4) ┌────────────┬─────────────────────┬───────────┬──────┐ │ subject_id ┆ timestamp ┆ discharge ┆ male │ │ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ datetime[μs] ┆ i64 ┆ i64 │ ╞════════════╪═════════════════════╪═══════════╪══════╡ │ 1 ┆ 1989-01-01 00:00:00 ┆ 0 ┆ 0 │ │ 1 ┆ 1989-01-01 01:00:00 ┆ 2 ┆ 0 │ │ 2 ┆ 1989-01-01 02:00:00 ┆ 0 ┆ 0 │ │ 3 ┆ null ┆ 0 ┆ 1 │ └────────────┴─────────────────────┴───────────┴──────┘
- aces.predicates.get_predicates_df(cfg: TaskExtractorConfig, data_config: DictConfig) DataFrame[source]¶
Generate predicate columns based on the configuration.
- Parameters:¶
- cfg: TaskExtractorConfig¶
The TaskExtractorConfig object containing the predicates information.
- data_path
Path to external data (file path to .csv or .parquet, or ESGPT directory) as string or Path.
- standard
The data standard, either ‘direct’, ‘MEDS’ or ‘ESGPT’.
- Returns:¶
The Polars DataFrame with the added predicate columns.
- Return type:¶
pl.DataFrame
- Raises:¶
ValueError – If an invalid predicate type is specified in the configuration.
Example
>>> from .config import DerivedPredicateConfig, EventConfig, WindowConfig >>> data = pl.DataFrame({ ... "subject_id": [1, 1, 1, 2, 2, 2], ... "timestamp": [ ... None, ... "01/01/2021 00:00", ... "01/01/2021 12:00", ... None, ... "01/02/2021 00:00", ... "01/02/2021 12:00"], ... "adm": [0, 1, 0, 0, 1, 0], ... "dis": [0, 0, 1, 0, 0, 0], ... "death": [0, 0, 0, 0, 0, 1], ... "male": [1, 0, 0, 0, 0, 0], ... "female": [0, 0, 0, 1, 0, 0], ... }) >>> predicates = { ... "adm": PlainPredicateConfig("adm"), ... "dis": PlainPredicateConfig("dis"), ... "death": PlainPredicateConfig("death"), ... "male": PlainPredicateConfig("male", static=True), # predicate match based on name for direct ... "death_or_dis": DerivedPredicateConfig("or(death, dis)"), ... } >>> trigger = EventConfig("adm") >>> windows = { ... "input": WindowConfig( ... start=None, ... end="trigger + 24h", ... start_inclusive=True, ... end_inclusive=True, ... has={"_ANY_EVENT": "(32, None)"}, ... ), ... "gap": WindowConfig( ... start="input.end", ... end="start + 24h", ... start_inclusive=False, ... end_inclusive=True, ... has={ ... "death_or_dis": "(None, 0)", ... "adm": "(None, 0)", ... }, ... ), ... "target": WindowConfig( ... start="gap.end", ... end="start -> death_or_dis", ... start_inclusive=False, ... end_inclusive=True, ... has={}, ... ), ... } >>> config = TaskExtractorConfig(predicates=predicates, trigger=trigger, windows=windows) >>> with tempfile.NamedTemporaryFile(mode="w", suffix=".csv") as f: ... data_path = Path(f.name) ... data.write_csv(data_path) ... data_config = DictConfig({ ... "path": str(data_path), "standard": "direct", "ts_format": "%m/%d/%Y %H:%M" ... }) ... 
get_predicates_df(config, data_config) shape: (6, 8) ┌────────────┬─────────────────────┬─────┬─────┬───────┬──────┬──────────────┬────────────┐ │ subject_id ┆ timestamp ┆ adm ┆ dis ┆ death ┆ male ┆ death_or_dis ┆ _ANY_EVENT │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ ╞════════════╪═════════════════════╪═════╪═════╪═══════╪══════╪══════════════╪════════════╡ │ 1 ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ 1 ┆ 0 ┆ null │ │ 1 ┆ 2021-01-01 00:00:00 ┆ 1 ┆ 0 ┆ 0 ┆ 0 ┆ 0 ┆ 1 │ │ 1 ┆ 2021-01-01 12:00:00 ┆ 0 ┆ 1 ┆ 0 ┆ 0 ┆ 1 ┆ 1 │ │ 2 ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ 0 ┆ 0 ┆ null │ │ 2 ┆ 2021-01-02 00:00:00 ┆ 1 ┆ 0 ┆ 0 ┆ 0 ┆ 0 ┆ 1 │ │ 2 ┆ 2021-01-02 12:00:00 ┆ 0 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 1 │ └────────────┴─────────────────────┴─────┴─────┴───────┴──────┴──────────────┴────────────┘ >>> with tempfile.NamedTemporaryFile(mode="w", suffix=".parquet") as f: ... data_path = Path(f.name) ... ( ... data ... .with_columns(pl.col("timestamp").str.strptime(pl.Datetime, format="%m/%d/%Y %H:%M")) ... .write_parquet(data_path) ... ) ... data_config = DictConfig({"path": str(data_path), "standard": "direct", "ts_format": None}) ... 
get_predicates_df(config, data_config) shape: (6, 8) ┌────────────┬─────────────────────┬─────┬─────┬───────┬──────┬──────────────┬────────────┐ │ subject_id ┆ timestamp ┆ adm ┆ dis ┆ death ┆ male ┆ death_or_dis ┆ _ANY_EVENT │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ ╞════════════╪═════════════════════╪═════╪═════╪═══════╪══════╪══════════════╪════════════╡ │ 1 ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ 1 ┆ 0 ┆ null │ │ 1 ┆ 2021-01-01 00:00:00 ┆ 1 ┆ 0 ┆ 0 ┆ 0 ┆ 0 ┆ 1 │ │ 1 ┆ 2021-01-01 12:00:00 ┆ 0 ┆ 1 ┆ 0 ┆ 0 ┆ 1 ┆ 1 │ │ 2 ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ 0 ┆ 0 ┆ null │ │ 2 ┆ 2021-01-02 00:00:00 ┆ 1 ┆ 0 ┆ 0 ┆ 0 ┆ 0 ┆ 1 │ │ 2 ┆ 2021-01-02 12:00:00 ┆ 0 ┆ 0 ┆ 1 ┆ 0 ┆ 1 ┆ 1 │ └────────────┴─────────────────────┴─────┴─────┴───────┴──────┴──────────────┴────────────┘ >>> any_event_trigger = EventConfig("_ANY_EVENT") >>> adm_only_predicates = {"adm": PlainPredicateConfig("adm"), "male": PlainPredicateConfig("male")} >>> st_end_windows = { ... "input": WindowConfig( ... start="end - 365d", ... end="trigger + 24h", ... start_inclusive=True, ... end_inclusive=True, ... has={ ... "_RECORD_END": "(None, 0)", # These are added just to show start/end predicates ... "_RECORD_START": "(None, 0)", # These are added just to show start/end predicates ... }, ... ), ... } >>> st_end_config = TaskExtractorConfig( ... predicates=adm_only_predicates, trigger=any_event_trigger, windows=st_end_windows ... ) >>> with tempfile.NamedTemporaryFile(mode="w", suffix=".csv") as f: ... data_path = Path(f.name) ... data.write_csv(data_path) ... data_config = DictConfig({ ... "path": str(data_path), "standard": "direct", "ts_format": "%m/%d/%Y %H:%M" ... }) ... 
get_predicates_df(st_end_config, data_config) shape: (6, 7) ┌────────────┬─────────────────────┬─────┬──────┬────────────┬───────────────┬─────────────┐ │ subject_id ┆ timestamp ┆ adm ┆ male ┆ _ANY_EVENT ┆ _RECORD_START ┆ _RECORD_END │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ ╞════════════╪═════════════════════╪═════╪══════╪════════════╪═══════════════╪═════════════╡ │ 1 ┆ null ┆ 0 ┆ 1 ┆ null ┆ null ┆ null │ │ 1 ┆ 2021-01-01 00:00:00 ┆ 1 ┆ 0 ┆ 1 ┆ 1 ┆ 0 │ │ 1 ┆ 2021-01-01 12:00:00 ┆ 0 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ │ 2 ┆ null ┆ 0 ┆ 0 ┆ null ┆ null ┆ null │ │ 2 ┆ 2021-01-02 00:00:00 ┆ 1 ┆ 0 ┆ 1 ┆ 1 ┆ 0 │ │ 2 ┆ 2021-01-02 12:00:00 ┆ 0 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ └────────────┴─────────────────────┴─────┴──────┴────────────┴───────────────┴─────────────┘>>> data = pl.DataFrame({ ... "subject_id": [1, 1, 1, 2, 2], ... "timestamp": [ ... None, ... "01/01/2021 00:00", ... "01/01/2021 12:00", ... "01/02/2021 00:00", ... "01/02/2021 12:00"], ... "adm": [0, 1, 0, 1, 0], ... "male": [1, 0, 0, 0, 0], ... }) >>> predicates = { ... "adm": PlainPredicateConfig("adm"), ... "male": PlainPredicateConfig("male", static=True), # predicate match based on name for direct ... "male_adm": DerivedPredicateConfig("and(male, adm)", static=['male']), ... } >>> trigger = EventConfig("adm") >>> windows = { ... "input": WindowConfig( ... start=None, ... end="trigger + 24h", ... start_inclusive=True, ... end_inclusive=True, ... has={"_ANY_EVENT": "(32, None)"}, ... ), ... "gap": WindowConfig( ... start="input.end", ... end="start + 24h", ... start_inclusive=False, ... end_inclusive=True, ... has={ ... "adm": "(None, 0)", ... "male_adm": "(None, 0)", ... }, ... ), ... } >>> config = TaskExtractorConfig(predicates=predicates, trigger=trigger, windows=windows) >>> with tempfile.NamedTemporaryFile(mode="w", suffix=".csv") as f: ... data_path = Path(f.name) ... data.write_csv(data_path) ... data_config = DictConfig({ ... 
"path": str(data_path), "standard": "direct", "ts_format": "%m/%d/%Y %H:%M" ... }) ... get_predicates_df(config, data_config) shape: (5, 6) ┌────────────┬─────────────────────┬─────┬──────┬──────────┬────────────┐ │ subject_id ┆ timestamp ┆ adm ┆ male ┆ male_adm ┆ _ANY_EVENT │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ ╞════════════╪═════════════════════╪═════╪══════╪══════════╪════════════╡ │ 1 ┆ null ┆ 0 ┆ 1 ┆ 0 ┆ null │ │ 1 ┆ 2021-01-01 00:00:00 ┆ 1 ┆ 1 ┆ 1 ┆ 1 │ │ 1 ┆ 2021-01-01 12:00:00 ┆ 0 ┆ 1 ┆ 0 ┆ 1 │ │ 2 ┆ 2021-01-02 00:00:00 ┆ 1 ┆ 0 ┆ 0 ┆ 1 │ │ 2 ┆ 2021-01-02 12:00:00 ┆ 0 ┆ 0 ┆ 0 ┆ 1 │ └────────────┴─────────────────────┴─────┴──────┴──────────┴────────────┘>>> with tempfile.NamedTemporaryFile(mode="w", suffix=".csv") as f: ... data_path = Path(f.name) ... data.write_csv(data_path) ... data_config = DictConfig({ ... "path": str(data_path), "standard": "buzz", "ts_format": "%m/%d/%Y %H:%M" ... }) ... get_predicates_df(config, data_config) Traceback (most recent call last): ... ValueError: Invalid data standard: buzz. Options are 'direct', 'MEDS', 'ESGPT'.
- aces.predicates.process_esgpt_data(subjects_df: DataFrame, events_df: DataFrame, dynamic_measurements_df: DataFrame, value_columns: dict[str, str], predicates: dict) DataFrame[source]¶
Process ESGPT data to generate plain predicate columns.
- Parameters:¶
- subjects_df: DataFrame¶
The Polars DataFrame containing the subjects/demographics data.
- events_df: DataFrame¶
The Polars DataFrame containing the events data.
- dynamic_measurements_df: DataFrame¶
The Polars DataFrame containing the dynamic measurements data.
- value_columns: dict[str, str]¶
A dictionary mapping predicate names to the column name containing numeric values for that predicate, or None if the predicate does not have an associated value column.
- predicates: dict¶
A dictionary mapping predicate names to their PlainPredicateConfig objects.
- Returns:¶
The Polars DataFrame containing the extracted predicates per subject per timestamp across the entire ESGPT dataset.
Examples
>>> subjects_df = pl.DataFrame({ ... "subject_id": [1, 2], ... "MRN": ["A123", "B456"], ... "eye_colour": ["brown", "blue"], ... "dob": [datetime(1980, 1, 1), datetime(1990, 1, 1)], ... }) >>> events_df = pl.DataFrame({ ... "event_id": [1, 2, 3, 4], ... "subject_id": [1, 1, 2, 2], ... "timestamp": [ ... datetime(2021, 1, 1, 0, 0), ... datetime(2021, 1, 1, 12, 0), ... datetime(2021, 1, 2, 0, 0), ... datetime(2021, 1, 2, 12, 0), ... ], ... "event_type": ["adm", "dis", "adm", "obs"], ... "age": [30, 30, 40, 40], ... }) >>> dynamic_measurements_df = pl.DataFrame({ ... "event_id": [1, 1, 1, 2, 2, 2, 3, 4, 5], ... "adm_loc": ["foo", None, None, None, None, None, "bar", None, None], ... "dis_loc": [None, None, None, None, None, "H", None, None, None], ... "HR": [None, 150, None, 120, None, None, None, 177, 89], ... "lab": [None, None, "K", None, "K", None, None, None, "SpO2"], ... "lab_val": [None, None, 5.1, None, 3.8, None, None, None, 99], ... }) >>> value_columns = { ... "is_admission": None, ... "is_discharge": None, ... "high_HR": "HR", ... "high_Potassium": "lab_val", ... } >>> predicates = { ... "is_adm": PlainPredicateConfig(code="event_type//adm"), ... "is_dis": PlainPredicateConfig(code="event_type//dis"), ... "high_HR": PlainPredicateConfig(code="HR", value_min=140), ... "high_Potassium": PlainPredicateConfig(code="lab//K", value_min=5.0), ... "eye_colour": PlainPredicateConfig(code="eye_colour//brown", static=True), ... 
} >>> process_esgpt_data(subjects_df, events_df, dynamic_measurements_df, value_columns, predicates) shape: (6, 7) ┌────────────┬─────────────────────┬────────┬────────┬─────────┬────────────────┬────────────┐ │ subject_id ┆ timestamp ┆ is_adm ┆ is_dis ┆ high_HR ┆ high_Potassium ┆ eye_colour │ │ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │ │ i64 ┆ datetime[μs] ┆ i64 ┆ i64 ┆ i64 ┆ i64 ┆ i64 │ ╞════════════╪═════════════════════╪════════╪════════╪═════════╪════════════════╪════════════╡ │ 1 ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ 0 ┆ 1 │ │ 2 ┆ null ┆ 0 ┆ 0 ┆ 0 ┆ 0 ┆ 0 │ │ 1 ┆ 2021-01-01 00:00:00 ┆ 1 ┆ 0 ┆ 1 ┆ 1 ┆ 0 │ │ 1 ┆ 2021-01-01 12:00:00 ┆ 0 ┆ 1 ┆ 0 ┆ 0 ┆ 0 │ │ 2 ┆ 2021-01-02 00:00:00 ┆ 1 ┆ 0 ┆ 0 ┆ 0 ┆ 0 │ │ 2 ┆ 2021-01-02 12:00:00 ┆ 0 ┆ 0 ┆ 1 ┆ 0 ┆ 0 │ └────────────┴─────────────────────┴────────┴────────┴─────────┴────────────────┴────────────┘