aces.predicates module

This module contains functions for generating predicate columns for event sequences.

aces.predicates.direct_load_plain_predicates(data_path: Path, predicates: list[str], ts_format: str | None) DataFrame[source]

Loads a CSV or Parquet file from disk and verifies that the necessary plain predicate columns are present.

This file must have the following columns:
  • subject_id: The subject identifier.

  • timestamp: The timestamp of the event, either already stored as a temporal type or as a string in the format given by ts_format (e.g., “MM/DD/YYYY HH:MM”).

  • Any additional columns specified in the set of desired plain predicates.

Parameters:
data_path: Path

The path to the CSV or Parquet file.

predicates: list[str]

The list of plain predicate columns to read from the file.

ts_format: str | None

The format string for parsing timestamps, or None if timestamps are already temporal.

Returns:

The Polars DataFrame containing the specified columns.

Example

>>> CSV_data = pl.DataFrame({
...     "subject_id": [1, 1, 1, 1, 2, 2],
...     "timestamp": [None, "01/01/2021 00:00", None, "01/01/2021 12:00", "01/02/2021 00:00", None],
...     "is_admission": [0, 1, 0, 0, 1, 0],
...     "is_discharge": [0, 0, 0, 1, 0, 0],
...     "is_male": [1, 0, 0, 0, 0, 0],
...     "is_female": [0, 0, 0, 0, 0, 1],
...     "brown_eyes": [0, 0, 1, 0, 0, 0],
... })
>>> with tempfile.NamedTemporaryFile(mode="w", suffix=".parquet") as f:
...     data_path = Path(f.name)
...     CSV_data.write_parquet(data_path)
...     direct_load_plain_predicates(data_path, ["is_admission", "is_discharge", "is_male",
...          "is_female", "brown_eyes"], "%m/%d/%Y %H:%M")
shape: (5, 7)
┌────────────┬─────────────────────┬──────────────┬──────────────┬─────────┬───────────┬────────────┐
│ subject_id ┆ timestamp           ┆ is_admission ┆ is_discharge ┆ is_male ┆ is_female ┆ brown_eyes │
│ ---        ┆ ---                 ┆ ---          ┆ ---          ┆ ---     ┆ ---       ┆ ---        │
│ i64        ┆ datetime[μs]        ┆ i64          ┆ i64          ┆ i64     ┆ i64       ┆ i64        │
╞════════════╪═════════════════════╪══════════════╪══════════════╪═════════╪═══════════╪════════════╡
│ 1          ┆ null                ┆ 0            ┆ 0            ┆ 1       ┆ 0         ┆ 1          │
│ 1          ┆ 2021-01-01 00:00:00 ┆ 1            ┆ 0            ┆ 0       ┆ 0         ┆ 0          │
│ 1          ┆ 2021-01-01 12:00:00 ┆ 0            ┆ 1            ┆ 0       ┆ 0         ┆ 0          │
│ 2          ┆ 2021-01-02 00:00:00 ┆ 1            ┆ 0            ┆ 0       ┆ 0         ┆ 0          │
│ 2          ┆ null                ┆ 0            ┆ 0            ┆ 0       ┆ 1         ┆ 0          │
└────────────┴─────────────────────┴──────────────┴──────────────┴─────────┴───────────┴────────────┘

If the timestamp column is already a timestamp, then the ts_format argument is not needed, but it can still be provided without causing an error.

>>> with tempfile.NamedTemporaryFile(mode="w", suffix=".parquet") as f:
...     data_path = Path(f.name)
...     (
...         CSV_data
...         .with_columns(pl.col("timestamp").str.strptime(pl.Datetime, format="%m/%d/%Y %H:%M"))
...         .write_parquet(data_path)
...     )
...     direct_load_plain_predicates(data_path, ["is_admission", "is_discharge", "is_male",
...          "is_female", "brown_eyes"], "%m/%d/%Y %H:%M")
shape: (5, 7)
┌────────────┬─────────────────────┬──────────────┬──────────────┬─────────┬───────────┬────────────┐
│ subject_id ┆ timestamp           ┆ is_admission ┆ is_discharge ┆ is_male ┆ is_female ┆ brown_eyes │
│ ---        ┆ ---                 ┆ ---          ┆ ---          ┆ ---     ┆ ---       ┆ ---        │
│ i64        ┆ datetime[μs]        ┆ i64          ┆ i64          ┆ i64     ┆ i64       ┆ i64        │
╞════════════╪═════════════════════╪══════════════╪══════════════╪═════════╪═══════════╪════════════╡
│ 1          ┆ null                ┆ 0            ┆ 0            ┆ 1       ┆ 0         ┆ 1          │
│ 1          ┆ 2021-01-01 00:00:00 ┆ 1            ┆ 0            ┆ 0       ┆ 0         ┆ 0          │
│ 1          ┆ 2021-01-01 12:00:00 ┆ 0            ┆ 1            ┆ 0       ┆ 0         ┆ 0          │
│ 2          ┆ 2021-01-02 00:00:00 ┆ 1            ┆ 0            ┆ 0       ┆ 0         ┆ 0          │
│ 2          ┆ null                ┆ 0            ┆ 0            ┆ 0       ┆ 1         ┆ 0          │
└────────────┴─────────────────────┴──────────────┴──────────────┴─────────┴───────────┴────────────┘
>>> with tempfile.NamedTemporaryFile(mode="w", suffix=".parquet") as f:
...     data_path = Path(f.name)
...     (
...         CSV_data
...         .with_columns(pl.col("timestamp").str.strptime(pl.Datetime, format="%m/%d/%Y %H:%M"))
...         .write_parquet(data_path)
...     )
...     direct_load_plain_predicates(data_path, ["is_admission", "is_discharge", "is_male",
...          "is_female", "brown_eyes"], None)
shape: (5, 7)
┌────────────┬─────────────────────┬──────────────┬──────────────┬─────────┬───────────┬────────────┐
│ subject_id ┆ timestamp           ┆ is_admission ┆ is_discharge ┆ is_male ┆ is_female ┆ brown_eyes │
│ ---        ┆ ---                 ┆ ---          ┆ ---          ┆ ---     ┆ ---       ┆ ---        │
│ i64        ┆ datetime[μs]        ┆ i64          ┆ i64          ┆ i64     ┆ i64       ┆ i64        │
╞════════════╪═════════════════════╪══════════════╪══════════════╪═════════╪═══════════╪════════════╡
│ 1          ┆ null                ┆ 0            ┆ 0            ┆ 1       ┆ 0         ┆ 1          │
│ 1          ┆ 2021-01-01 00:00:00 ┆ 1            ┆ 0            ┆ 0       ┆ 0         ┆ 0          │
│ 1          ┆ 2021-01-01 12:00:00 ┆ 0            ┆ 1            ┆ 0       ┆ 0         ┆ 0          │
│ 2          ┆ 2021-01-02 00:00:00 ┆ 1            ┆ 0            ┆ 0       ┆ 0         ┆ 0          │
│ 2          ┆ null                ┆ 0            ┆ 0            ┆ 0       ┆ 1         ┆ 0          │
└────────────┴─────────────────────┴──────────────┴──────────────┴─────────┴───────────┴────────────┘
>>> with tempfile.NamedTemporaryFile(mode="w", suffix=".csv") as f:
...     data_path = Path(f.name)
...     CSV_data.write_csv(data_path)
...     direct_load_plain_predicates(data_path, ["is_admission", "is_discharge", "is_male",
...          "is_female", "brown_eyes"], "%m/%d/%Y %H:%M")
shape: (5, 7)
┌────────────┬─────────────────────┬──────────────┬──────────────┬─────────┬───────────┬────────────┐
│ subject_id ┆ timestamp           ┆ is_admission ┆ is_discharge ┆ is_male ┆ is_female ┆ brown_eyes │
│ ---        ┆ ---                 ┆ ---          ┆ ---          ┆ ---     ┆ ---       ┆ ---        │
│ i64        ┆ datetime[μs]        ┆ i64          ┆ i64          ┆ i64     ┆ i64       ┆ i64        │
╞════════════╪═════════════════════╪══════════════╪══════════════╪═════════╪═══════════╪════════════╡
│ 1          ┆ null                ┆ 0            ┆ 0            ┆ 1       ┆ 0         ┆ 1          │
│ 1          ┆ 2021-01-01 00:00:00 ┆ 1            ┆ 0            ┆ 0       ┆ 0         ┆ 0          │
│ 1          ┆ 2021-01-01 12:00:00 ┆ 0            ┆ 1            ┆ 0       ┆ 0         ┆ 0          │
│ 2          ┆ 2021-01-02 00:00:00 ┆ 1            ┆ 0            ┆ 0       ┆ 0         ┆ 0          │
│ 2          ┆ null                ┆ 0            ┆ 0            ┆ 0       ┆ 1         ┆ 0          │
└────────────┴─────────────────────┴──────────────┴──────────────┴─────────┴───────────┴────────────┘
>>> with tempfile.NamedTemporaryFile(mode="w", suffix=".csv") as f:
...     data_path = Path(f.name)
...     CSV_data.write_csv(data_path)
...     direct_load_plain_predicates(data_path, ["is_discharge", "brown_eyes"], "%m/%d/%Y %H:%M")
shape: (5, 4)
┌────────────┬─────────────────────┬──────────────┬────────────┐
│ subject_id ┆ timestamp           ┆ is_discharge ┆ brown_eyes │
│ ---        ┆ ---                 ┆ ---          ┆ ---        │
│ i64        ┆ datetime[μs]        ┆ i64          ┆ i64        │
╞════════════╪═════════════════════╪══════════════╪════════════╡
│ 1          ┆ null                ┆ 0            ┆ 1          │
│ 1          ┆ 2021-01-01 00:00:00 ┆ 0            ┆ 0          │
│ 1          ┆ 2021-01-01 12:00:00 ┆ 1            ┆ 0          │
│ 2          ┆ 2021-01-02 00:00:00 ┆ 0            ┆ 0          │
│ 2          ┆ null                ┆ 0            ┆ 0          │
└────────────┴─────────────────────┴──────────────┴────────────┘
>>> with tempfile.NamedTemporaryFile(mode="w", suffix=".csv") as f:
...     data_path = Path(f.name)
...     CSV_data.write_csv(data_path)
...     direct_load_plain_predicates(data_path, ["is_foobar"], "%m/%d/%Y %H:%M")
Traceback (most recent call last):
    ...
polars.exceptions.ColumnNotFoundError: ['is_foobar']
>>> with tempfile.NamedTemporaryFile(mode="w", suffix=".foo") as f:
...     data_path = Path(f.name)
...     CSV_data.write_csv(data_path)
...     direct_load_plain_predicates(data_path, ["is_discharge"], "%m/%d/%Y %H:%M")
Traceback (most recent call last):
    ...
ValueError: Unsupported file format: .foo
>>> with tempfile.TemporaryDirectory() as d:
...     data_path = Path(d) / "data.csv"
...     assert not data_path.exists()
...     direct_load_plain_predicates(data_path, ["is_admission", "is_discharge"], "%m/%d/%Y %H:%M")
Traceback (most recent call last):
    ...
FileNotFoundError: Direct predicates file ... does not exist!
>>> with tempfile.NamedTemporaryFile(mode="w", suffix=".parquet") as f:
...     data_path = Path(f.name)
...     CSV_data.write_parquet(data_path)
...     direct_load_plain_predicates(data_path, ["is_admission", "is_discharge"], None)
Traceback (most recent call last):
    ...
ValueError: Must provide a timestamp format for direct predicates with str timestamps.
>>> with tempfile.NamedTemporaryFile(mode="w", suffix=".parquet") as f:
...     data_path = Path(f.name)
...     (
...         CSV_data
...         .with_columns(
...             pl.col("timestamp").str.strptime(pl.Datetime, format="%m/%d/%Y %H:%M")
...             .dt.timestamp()
...         )
...         .write_parquet(data_path)
...     )
...     direct_load_plain_predicates(data_path, ["is_admission", "is_discharge"], None)
Traceback (most recent call last):
    ...
TypeError: Passed predicates have timestamps of invalid type Int64.
aces.predicates.generate_plain_predicates_from_esgpt(data_path: Path, predicates: dict) DataFrame[source]

Generate plain predicate columns from an ESGPT dataset.

To learn more about the ESGPT format, please visit https://eventstreamml.readthedocs.io/en/latest/

Parameters:
data_path: Path

The path to the ESGPT dataset directory.

predicates: dict

The dictionary of plain predicate configurations.

Returns:

The Polars DataFrame containing the extracted predicates per subject per timestamp across the entire ESGPT dataset.

>>> generate_plain_predicates_from_esgpt(Path("/fake/path"), {})
Traceback (most recent call last):
    ...
ImportError: The 'EventStream' package is required to load ESGPT datasets. If you mean to use a
MEDS dataset, please specify the 'MEDS' standard. Otherwise, please install the package from
https://github.com/mmcdermott/EventStreamGPT and add the package to your PYTHONPATH.

aces.predicates.generate_plain_predicates_from_meds(data_path: Path, predicates: dict[str, aces.config.PlainPredicateConfig]) DataFrame[source]

Generate plain predicate columns from a MEDS dataset.

To learn more about the MEDS format, please visit https://github.com/Medical-Event-Data-Standard/meds

Parameters:
data_path: Path

The path to the MEDS dataset file.

predicates: dict[str, aces.config.PlainPredicateConfig]

The dictionary of plain predicate configurations.

Returns:

The Polars DataFrame containing the extracted predicates per subject per timestamp across the entire MEDS dataset.

Example

>>> parquet_data = pl.DataFrame({
...     "subject_id": [1, 1, 1, 2, 3],
...     "time": ["1/1/1989 00:00", "1/1/1989 01:00", "1/1/1989 01:00", "1/1/1989 02:00", None],
...     "code": ['admission', 'discharge', 'discharge', 'admission', "gender//male"],
... }).with_columns(pl.col("time").str.strptime(pl.Datetime, format="%m/%d/%Y %H:%M"))
>>> with tempfile.NamedTemporaryFile(mode="w", suffix=".parquet") as f:
...     data_path = Path(f.name)
...     parquet_data.write_parquet(data_path)
...     generate_plain_predicates_from_meds(
...         data_path,
...         {"discharge": PlainPredicateConfig("discharge"),
...             "male": PlainPredicateConfig("gender//male", static=True)}
...     )
shape: (4, 4)
┌────────────┬─────────────────────┬───────────┬──────┐
│ subject_id ┆ timestamp           ┆ discharge ┆ male │
│ ---        ┆ ---                 ┆ ---       ┆ ---  │
│ i64        ┆ datetime[μs]        ┆ i64       ┆ i64  │
╞════════════╪═════════════════════╪═══════════╪══════╡
│ 1          ┆ 1989-01-01 00:00:00 ┆ 0         ┆ 0    │
│ 1          ┆ 1989-01-01 01:00:00 ┆ 2         ┆ 0    │
│ 2          ┆ 1989-01-01 02:00:00 ┆ 0         ┆ 0    │
│ 3          ┆ null                ┆ 0         ┆ 1    │
└────────────┴─────────────────────┴───────────┴──────┘
aces.predicates.get_predicates_df(cfg: TaskExtractorConfig, data_config: DictConfig) DataFrame[source]

Generate predicate columns based on the configuration.

Parameters:
cfg: TaskExtractorConfig

The TaskExtractorConfig object containing the predicates information.

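data_config: DictConfig

The data configuration, containing the following keys:

  • path: Path to external data (file path to .csv or .parquet, or ESGPT directory) as string or Path.

  • standard: The data standard, either ‘direct’, ‘MEDS’ or ‘ESGPT’.

  • ts_format: The format string for parsing timestamps, or None if timestamps are already temporal.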

Returns:

The Polars DataFrame with the added predicate columns.

Return type:

pl.DataFrame

Raises:

ValueError – If an invalid predicate type or an invalid data standard is specified in the configuration.

Example

>>> from .config import DerivedPredicateConfig, EventConfig, WindowConfig
>>> data = pl.DataFrame({
...     "subject_id": [1, 1, 1, 2, 2, 2],
...     "timestamp": [
...         None,
...         "01/01/2021 00:00",
...         "01/01/2021 12:00",
...         None,
...         "01/02/2021 00:00",
...         "01/02/2021 12:00"],
...     "adm":       [0, 1, 0, 0, 1, 0],
...     "dis":       [0, 0, 1, 0, 0, 0],
...     "death":     [0, 0, 0, 0, 0, 1],
...     "male":      [1, 0, 0, 0, 0, 0],
...     "female":    [0, 0, 0, 1, 0, 0],
... })
>>> predicates = {
...     "adm": PlainPredicateConfig("adm"),
...     "dis": PlainPredicateConfig("dis"),
...     "death": PlainPredicateConfig("death"),
...     "male": PlainPredicateConfig("male", static=True), # predicate match based on name for direct
...     "death_or_dis": DerivedPredicateConfig("or(death, dis)"),
... }
>>> trigger = EventConfig("adm")
>>> windows = {
...     "input": WindowConfig(
...         start=None,
...         end="trigger + 24h",
...         start_inclusive=True,
...         end_inclusive=True,
...         has={"_ANY_EVENT": "(32, None)"},
...     ),
...     "gap": WindowConfig(
...         start="input.end",
...         end="start + 24h",
...         start_inclusive=False,
...         end_inclusive=True,
...         has={
...             "death_or_dis": "(None, 0)",
...             "adm": "(None, 0)",
...         },
...     ),
...     "target": WindowConfig(
...         start="gap.end",
...         end="start -> death_or_dis",
...         start_inclusive=False,
...         end_inclusive=True,
...         has={},
...     ),
... }
>>> config = TaskExtractorConfig(predicates=predicates, trigger=trigger, windows=windows)
>>> with tempfile.NamedTemporaryFile(mode="w", suffix=".csv") as f:
...     data_path = Path(f.name)
...     data.write_csv(data_path)
...     data_config = DictConfig({
...         "path": str(data_path), "standard": "direct", "ts_format": "%m/%d/%Y %H:%M"
...     })
...     get_predicates_df(config, data_config)
shape: (6, 8)
┌────────────┬─────────────────────┬─────┬─────┬───────┬──────┬──────────────┬────────────┐
│ subject_id ┆ timestamp           ┆ adm ┆ dis ┆ death ┆ male ┆ death_or_dis ┆ _ANY_EVENT │
│ ---        ┆ ---                 ┆ --- ┆ --- ┆ ---   ┆ ---  ┆ ---          ┆ ---        │
│ i64        ┆ datetime[μs]        ┆ i64 ┆ i64 ┆ i64   ┆ i64  ┆ i64          ┆ i64        │
╞════════════╪═════════════════════╪═════╪═════╪═══════╪══════╪══════════════╪════════════╡
│ 1          ┆ null                ┆ 0   ┆ 0   ┆ 0     ┆ 1    ┆ 0            ┆ null       │
│ 1          ┆ 2021-01-01 00:00:00 ┆ 1   ┆ 0   ┆ 0     ┆ 0    ┆ 0            ┆ 1          │
│ 1          ┆ 2021-01-01 12:00:00 ┆ 0   ┆ 1   ┆ 0     ┆ 0    ┆ 1            ┆ 1          │
│ 2          ┆ null                ┆ 0   ┆ 0   ┆ 0     ┆ 0    ┆ 0            ┆ null       │
│ 2          ┆ 2021-01-02 00:00:00 ┆ 1   ┆ 0   ┆ 0     ┆ 0    ┆ 0            ┆ 1          │
│ 2          ┆ 2021-01-02 12:00:00 ┆ 0   ┆ 0   ┆ 1     ┆ 0    ┆ 1            ┆ 1          │
└────────────┴─────────────────────┴─────┴─────┴───────┴──────┴──────────────┴────────────┘
>>> with tempfile.NamedTemporaryFile(mode="w", suffix=".parquet") as f:
...     data_path = Path(f.name)
...     (
...         data
...         .with_columns(pl.col("timestamp").str.strptime(pl.Datetime, format="%m/%d/%Y %H:%M"))
...         .write_parquet(data_path)
...     )
...     data_config = DictConfig({"path": str(data_path), "standard": "direct", "ts_format": None})
...     get_predicates_df(config, data_config)
shape: (6, 8)
┌────────────┬─────────────────────┬─────┬─────┬───────┬──────┬──────────────┬────────────┐
│ subject_id ┆ timestamp           ┆ adm ┆ dis ┆ death ┆ male ┆ death_or_dis ┆ _ANY_EVENT │
│ ---        ┆ ---                 ┆ --- ┆ --- ┆ ---   ┆ ---  ┆ ---          ┆ ---        │
│ i64        ┆ datetime[μs]        ┆ i64 ┆ i64 ┆ i64   ┆ i64  ┆ i64          ┆ i64        │
╞════════════╪═════════════════════╪═════╪═════╪═══════╪══════╪══════════════╪════════════╡
│ 1          ┆ null                ┆ 0   ┆ 0   ┆ 0     ┆ 1    ┆ 0            ┆ null       │
│ 1          ┆ 2021-01-01 00:00:00 ┆ 1   ┆ 0   ┆ 0     ┆ 0    ┆ 0            ┆ 1          │
│ 1          ┆ 2021-01-01 12:00:00 ┆ 0   ┆ 1   ┆ 0     ┆ 0    ┆ 1            ┆ 1          │
│ 2          ┆ null                ┆ 0   ┆ 0   ┆ 0     ┆ 0    ┆ 0            ┆ null       │
│ 2          ┆ 2021-01-02 00:00:00 ┆ 1   ┆ 0   ┆ 0     ┆ 0    ┆ 0            ┆ 1          │
│ 2          ┆ 2021-01-02 12:00:00 ┆ 0   ┆ 0   ┆ 1     ┆ 0    ┆ 1            ┆ 1          │
└────────────┴─────────────────────┴─────┴─────┴───────┴──────┴──────────────┴────────────┘
>>> any_event_trigger = EventConfig("_ANY_EVENT")
>>> adm_only_predicates = {"adm": PlainPredicateConfig("adm"), "male": PlainPredicateConfig("male")}
>>> st_end_windows = {
...     "input": WindowConfig(
...         start="end - 365d",
...         end="trigger + 24h",
...         start_inclusive=True,
...         end_inclusive=True,
...         has={
...             "_RECORD_END": "(None, 0)",   # These are added just to show start/end predicates
...             "_RECORD_START": "(None, 0)", # These are added just to show start/end predicates
...         },
...     ),
... }
>>> st_end_config = TaskExtractorConfig(
...     predicates=adm_only_predicates, trigger=any_event_trigger, windows=st_end_windows
... )
>>> with tempfile.NamedTemporaryFile(mode="w", suffix=".csv") as f:
...     data_path = Path(f.name)
...     data.write_csv(data_path)
...     data_config = DictConfig({
...         "path": str(data_path), "standard": "direct", "ts_format": "%m/%d/%Y %H:%M"
...     })
...     get_predicates_df(st_end_config, data_config)
shape: (6, 7)
┌────────────┬─────────────────────┬─────┬──────┬────────────┬───────────────┬─────────────┐
│ subject_id ┆ timestamp           ┆ adm ┆ male ┆ _ANY_EVENT ┆ _RECORD_START ┆ _RECORD_END │
│ ---        ┆ ---                 ┆ --- ┆ ---  ┆ ---        ┆ ---           ┆ ---         │
│ i64        ┆ datetime[μs]        ┆ i64 ┆ i64  ┆ i64        ┆ i64           ┆ i64         │
╞════════════╪═════════════════════╪═════╪══════╪════════════╪═══════════════╪═════════════╡
│ 1          ┆ null                ┆ 0   ┆ 1    ┆ null       ┆ null          ┆ null        │
│ 1          ┆ 2021-01-01 00:00:00 ┆ 1   ┆ 0    ┆ 1          ┆ 1             ┆ 0           │
│ 1          ┆ 2021-01-01 12:00:00 ┆ 0   ┆ 0    ┆ 1          ┆ 0             ┆ 1           │
│ 2          ┆ null                ┆ 0   ┆ 0    ┆ null       ┆ null          ┆ null        │
│ 2          ┆ 2021-01-02 00:00:00 ┆ 1   ┆ 0    ┆ 1          ┆ 1             ┆ 0           │
│ 2          ┆ 2021-01-02 12:00:00 ┆ 0   ┆ 0    ┆ 1          ┆ 0             ┆ 1           │
└────────────┴─────────────────────┴─────┴──────┴────────────┴───────────────┴─────────────┘
>>> data = pl.DataFrame({
...     "subject_id": [1, 1, 1, 2, 2],
...     "timestamp": [
...         None,
...         "01/01/2021 00:00",
...         "01/01/2021 12:00",
...         "01/02/2021 00:00",
...         "01/02/2021 12:00"],
...     "adm":       [0, 1, 0, 1, 0],
...     "male":      [1, 0, 0, 0, 0],
... })
>>> predicates = {
...     "adm": PlainPredicateConfig("adm"),
...     "male": PlainPredicateConfig("male", static=True), # predicate match based on name for direct
...     "male_adm": DerivedPredicateConfig("and(male, adm)", static=['male']),
... }
>>> trigger = EventConfig("adm")
>>> windows = {
...     "input": WindowConfig(
...         start=None,
...         end="trigger + 24h",
...         start_inclusive=True,
...         end_inclusive=True,
...         has={"_ANY_EVENT": "(32, None)"},
...     ),
...     "gap": WindowConfig(
...         start="input.end",
...         end="start + 24h",
...         start_inclusive=False,
...         end_inclusive=True,
...         has={
...             "adm": "(None, 0)",
...             "male_adm": "(None, 0)",
...         },
...     ),
... }
>>> config = TaskExtractorConfig(predicates=predicates, trigger=trigger, windows=windows)
>>> with tempfile.NamedTemporaryFile(mode="w", suffix=".csv") as f:
...     data_path = Path(f.name)
...     data.write_csv(data_path)
...     data_config = DictConfig({
...         "path": str(data_path), "standard": "direct", "ts_format": "%m/%d/%Y %H:%M"
...     })
...     get_predicates_df(config, data_config)
shape: (5, 6)
┌────────────┬─────────────────────┬─────┬──────┬──────────┬────────────┐
│ subject_id ┆ timestamp           ┆ adm ┆ male ┆ male_adm ┆ _ANY_EVENT │
│ ---        ┆ ---                 ┆ --- ┆ ---  ┆ ---      ┆ ---        │
│ i64        ┆ datetime[μs]        ┆ i64 ┆ i64  ┆ i64      ┆ i64        │
╞════════════╪═════════════════════╪═════╪══════╪══════════╪════════════╡
│ 1          ┆ null                ┆ 0   ┆ 1    ┆ 0        ┆ null       │
│ 1          ┆ 2021-01-01 00:00:00 ┆ 1   ┆ 1    ┆ 1        ┆ 1          │
│ 1          ┆ 2021-01-01 12:00:00 ┆ 0   ┆ 1    ┆ 0        ┆ 1          │
│ 2          ┆ 2021-01-02 00:00:00 ┆ 1   ┆ 0    ┆ 0        ┆ 1          │
│ 2          ┆ 2021-01-02 12:00:00 ┆ 0   ┆ 0    ┆ 0        ┆ 1          │
└────────────┴─────────────────────┴─────┴──────┴──────────┴────────────┘
>>> with tempfile.NamedTemporaryFile(mode="w", suffix=".csv") as f:
...     data_path = Path(f.name)
...     data.write_csv(data_path)
...     data_config = DictConfig({
...         "path": str(data_path), "standard": "buzz", "ts_format": "%m/%d/%Y %H:%M"
...     })
...     get_predicates_df(config, data_config)
Traceback (most recent call last):
    ...
ValueError: Invalid data standard: buzz. Options are 'direct', 'MEDS', 'ESGPT'.
aces.predicates.process_esgpt_data(subjects_df: DataFrame, events_df: DataFrame, dynamic_measurements_df: DataFrame, value_columns: dict[str, str], predicates: dict) DataFrame[source]

Process ESGPT data to generate plain predicate columns.

Parameters:
subjects_df: DataFrame

The Polars DataFrame containing the subjects/demographics data.

events_df: DataFrame

The Polars DataFrame containing the events data.

dynamic_measurements_df: DataFrame

The Polars DataFrame containing the dynamic measurements data.

value_columns: dict[str, str]

A dictionary mapping predicate names to the column name containing numeric values for that predicate, or None if the predicate does not have an associated value column.

predicates: dict

A dictionary mapping predicate names to their PlainPredicateConfig objects.

Returns:

The Polars DataFrame containing the extracted predicates per subject per timestamp across the entire ESGPT dataset.

Examples

>>> subjects_df = pl.DataFrame({
...    "subject_id": [1, 2],
...    "MRN": ["A123", "B456"],
...    "eye_colour": ["brown", "blue"],
...    "dob": [datetime(1980, 1, 1), datetime(1990, 1, 1)],
... })
>>> events_df = pl.DataFrame({
...    "event_id": [1, 2, 3, 4],
...    "subject_id": [1, 1, 2, 2],
...    "timestamp": [
...         datetime(2021, 1, 1, 0, 0),
...         datetime(2021, 1, 1, 12, 0),
...         datetime(2021, 1, 2, 0, 0),
...         datetime(2021, 1, 2, 12, 0),
...    ],
...    "event_type": ["adm", "dis", "adm", "obs"],
...    "age": [30, 30, 40, 40],
... })
>>> dynamic_measurements_df = pl.DataFrame({
...    "event_id": [1,     1,    1,    2,    2,    2,    3,     4,    5],
...    "adm_loc":  ["foo", None, None, None, None, None, "bar", None, None],
...    "dis_loc":  [None,  None, None, None, None, "H",  None,  None, None],
...    "HR":       [None,  150,  None, 120,  None, None, None,  177,  89],
...    "lab":      [None,  None, "K",  None, "K",  None, None,  None, "SpO2"],
...    "lab_val":  [None,  None, 5.1,  None, 3.8,  None, None,  None, 99],
... })
>>> value_columns = {
...    "is_admission": None,
...    "is_discharge": None,
...    "high_HR": "HR",
...    "high_Potassium": "lab_val",
... }
>>> predicates = {
...    "is_adm": PlainPredicateConfig(code="event_type//adm"),
...    "is_dis": PlainPredicateConfig(code="event_type//dis"),
...    "high_HR": PlainPredicateConfig(code="HR", value_min=140),
...    "high_Potassium": PlainPredicateConfig(code="lab//K", value_min=5.0),
...    "eye_colour": PlainPredicateConfig(code="eye_colour//brown", static=True),
... }
>>> process_esgpt_data(subjects_df, events_df, dynamic_measurements_df, value_columns, predicates)
shape: (6, 7)
┌────────────┬─────────────────────┬────────┬────────┬─────────┬────────────────┬────────────┐
│ subject_id ┆ timestamp           ┆ is_adm ┆ is_dis ┆ high_HR ┆ high_Potassium ┆ eye_colour │
│ ---        ┆ ---                 ┆ ---    ┆ ---    ┆ ---     ┆ ---            ┆ ---        │
│ i64        ┆ datetime[μs]        ┆ i64    ┆ i64    ┆ i64     ┆ i64            ┆ i64        │
╞════════════╪═════════════════════╪════════╪════════╪═════════╪════════════════╪════════════╡
│ 1          ┆ null                ┆ 0      ┆ 0      ┆ 0       ┆ 0              ┆ 1          │
│ 2          ┆ null                ┆ 0      ┆ 0      ┆ 0       ┆ 0              ┆ 0          │
│ 1          ┆ 2021-01-01 00:00:00 ┆ 1      ┆ 0      ┆ 1       ┆ 1              ┆ 0          │
│ 1          ┆ 2021-01-01 12:00:00 ┆ 0      ┆ 1      ┆ 0       ┆ 0              ┆ 0          │
│ 2          ┆ 2021-01-02 00:00:00 ┆ 1      ┆ 0      ┆ 0       ┆ 0              ┆ 0          │
│ 2          ┆ 2021-01-02 12:00:00 ┆ 0      ┆ 0      ┆ 1       ┆ 0              ┆ 0          │
└────────────┴─────────────────────┴────────┴────────┴─────────┴────────────────┴────────────┘