Source code for niimpy.reading.mhealth

"""Read data from various formats, user entry point.

This module contains various functions `read_*` which load data from different
formats into pandas.DataFrame:s.  As a side effect, it provides the
authoritative information on how incoming data is converted to dataframes.

"""

import pandas as pd
import json


# Maps the unit abbreviations found in mHealth duration entries to the unit
# names that duration_to_timedelta passes as `unit` to pandas.to_timedelta.
# NOTE(review): pandas.to_timedelta may not accept every name below (calendar
# units such as "months" and "years" in particular) - confirm.
mHealth_duration_units = {
    "ps": "picoseconds",
    "ns": "nanoseconds",
    "us": "microseconds",
    "ms": "milliseconds",
    "sec": "seconds",
    "min": "minutes",
    "h": "hours",
    "d": "days",
    "wk": "weeks",
    "Mo": "months",
    "yr": "years",
}



[docs]
def format_part_of_day(df, prefix):
    ''' Format an mHealth "date and part of day" time frame.

    The columns PREFIX.date and PREFIX.part_of_day are renamed to "date" and
    "part_of_day", and "date" is parsed into UTC datetimes. For every row
    that has a part of day, the "timestamp" column is set to the date.

    Parameters
    ----------

    df: pandas.DataFrame
        Dataframe containing the PREFIX.date and PREFIX.part_of_day columns.

    prefix: string
        Common prefix of the two mHealth columns.

    Returns
    -------

    df: pandas.DataFrame
        A renamed copy of the input with the "timestamp" column filled in.
    '''
    formatted = df.rename(columns={
        f'{prefix}.date': "date",
        f'{prefix}.part_of_day': "part_of_day",
    })

    formatted["date"] = pd.to_datetime(formatted["date"], utc=True)

    # Only rows that actually carry a part of day get their timestamp set
    has_part_of_day = formatted["part_of_day"].notnull()
    formatted.loc[has_part_of_day, "timestamp"] = formatted.loc[has_part_of_day, "date"]
    return formatted




[docs]
def duration_to_timedelta(df, duration_col):
    ''' Format a duration entry in the mHealth format.

    An mHealth duration is a dictionary that contains a value and a unit. In
    the dataframe it appears as the two columns DURATION_COL_NAME.value and
    DURATION_COL_NAME.unit, where the unit is one of the keys of
    mHealth_duration_units.

    Parameters
    ----------

    df: pandas.DataFrame
        Dataframe that may contain the DURATION_COL_NAME.value and
        DURATION_COL_NAME.unit columns. Rows without a value are skipped.
        The new duration column is added to this dataframe in place.

    duration_col: string
        Name of the duration entry, that is the common prefix of the value
        and unit columns. Also used as the name of the resulting column.

    Returns
    -------

    df: pandas.DataFrame
        Dataframe without the value and unit columns. If at least one row
        has a duration value, the DURATION_COL_NAME column contains the
        durations as timedelta objects. A dataframe without any duration
        columns is returned with its content unchanged.

    Raises
    ------

    KeyError
        If a row with a duration value has a unit that is not a key of
        mHealth_duration_units.
    '''
    duration_value_col = f'{duration_col}.value'
    duration_unit_col = f'{duration_col}.unit'

    def format_duration_row(row):
        unit = mHealth_duration_units[row[duration_unit_col]]
        value = row[duration_value_col]

        # There is no picosecond unit to pass to pd.to_timedelta, so
        # picoseconds are expressed as fractional nanoseconds instead.
        if unit == "picoseconds":
            unit = "nanoseconds"
            value *= 0.001

        return pd.to_timedelta(value, unit=unit)

    if duration_value_col in df.columns:
        rows = df[duration_value_col].notnull()
        # Applying a row function to zero rows does not produce a column of
        # timedeltas, so only convert when there is something to convert.
        if rows.any():
            df.loc[rows, duration_col] = df.loc[rows].apply(format_duration_row, axis=1)

    # The duration is optional in mHealth time intervals, so the value and
    # unit columns may be absent. Ignore missing columns instead of raising.
    df = df.drop([duration_value_col, duration_unit_col], axis=1, errors="ignore")
    return df




[docs]
def format_time_interval(df, prefix):
    ''' Format a dataframe containing columns in the mHealth time interval.

    A time interval in the mHealth format has either
     - a date and a part of day (morning, afternoon, evening or night), or
     - any two of start time, end time and duration.

    In the first case, the formatted dataframe will contain the columns
    "date" and "part_of_day".

    In the second case, the columns "start" and "end" are filled in. If only
    one of them is given together with a duration, the other is calculated
    from the duration.

    The "start" and "end" columns always exist in the result and contain NaT
    where the value is not known.

    Also sets the "timestamp" column to "start" where available and to
    "date" otherwise. Rows that have neither keep the timestamp they had in
    the input, if the input had a "timestamp" column.

    Parameters
    ----------

    df: pandas.DataFrame
        Dataframe containing the mHealth time interval columns. Columns may
        be added to this dataframe in place, so use the returned dataframe.

    prefix: string
        Common prefix of the mHealth time interval columns.

    Returns
    -------

    df: pandas.DataFrame
        Dataframe where the mHealth start, end and duration columns have
        been replaced by "start", "end" and "timestamp".
    '''
    # Create any of the result columns that may not exist
    for col in ["start", "end"]:
        if col not in df.columns:
            df[col] = pd.to_datetime(pd.Series(pd.NaT, index=df.index), utc=True)

    if f'{prefix}.part_of_day' in df.columns:
        df = format_part_of_day(df, prefix)

    # Construct column names from the prefix
    start_col = f'{prefix}.start_date_time'
    end_col = f'{prefix}.end_date_time'
    duration_col = f'{prefix}.duration'

    # Format the date-like columns. All the mHealth columns might exist if
    # the formatting in the data is mixed
    for col, mHealth_col in [("start", start_col), ("end", end_col)]:
        if mHealth_col in df.columns:
            df[col] = pd.to_datetime(df[mHealth_col], utc=True)

    # The duration is optional. Only parse it when the data contains one.
    # It is used below to calculate either start or end.
    if f'{duration_col}.value' in df.columns:
        df = duration_to_timedelta(df, duration_col)

    # If duration is provided, we calculate either start or end
    if start_col in df.columns and duration_col in df.columns:
        rows = df[duration_col].notnull() & df[start_col].notnull()
        df.loc[rows, "end"] = df.loc[rows, "start"] + df.loc[rows, duration_col]
        df["end"] = pd.to_datetime(df["end"], utc=True)

    if end_col in df.columns and duration_col in df.columns:
        rows = df[end_col].notnull() & df[duration_col].notnull()
        df.loc[rows, "start"] = df.loc[rows, "end"] - df.loc[rows, duration_col]
        df["start"] = pd.to_datetime(df["start"], utc=True)

    # Drop the original columns
    df = df.drop(columns=[start_col, end_col, duration_col], errors="ignore")

    # Set timestamp to the start time. A timestamp the caller has already
    # set (for example from an exact measurement time) is kept for the rows
    # that have no start time instead of being overwritten with NaT.
    if "timestamp" in df.columns:
        previous = pd.to_datetime(df["timestamp"], utc=True)
        df["timestamp"] = df["start"].fillna(previous)
    else:
        df["timestamp"] = df["start"]

    # Where the timestamp is still absent (happens when the time interval is
    # given as date and part of day), set timestamp to date
    if "date" in df.columns:
        rows = df["timestamp"].isnull()
        df.loc[rows, "timestamp"] = df.loc[rows, "date"]
        df["timestamp"] = pd.to_datetime(df["timestamp"], utc=True)

    return df





[docs]
def total_sleep_time(data_list):
    '''
    Convert mHealth total sleep time records into a Niimpy compatible
    DataFrame.

    The records are flattened with pandas.json_normalize. The sleep time
    value and unit are then merged into a single column and the measurement
    time frame is formatted with format_time_interval. The resulting
    DataFrame contains
        - total_sleep_time : The total sleep time as a timedelta
        - start : start time of the measurement interval
        - end : end time of the measurement interval
    and, when the time frame is given as a date and part of day,
        - date : the date of the measurement
        - part_of_day : the time of day the measurement was made

    Any other fields in the records (such as descriptive statistics) are
    kept under the column names produced by pandas.json_normalize.

    The dataframe is indexed by "timestamp", which is either the "start" or
    the "date".

    Parameters
    ----------

    data_list: list of dictionaries
        MHealth formatted sleep duration data loaded with json.load()

    Returns
    -------

    data: A pandas.DataFrame containing sleep duration data
    '''
    frame = pd.json_normalize(data_list)

    # Merge value and unit first, then resolve the measurement time frame
    frame = format_time_interval(
        duration_to_timedelta(frame, "total_sleep_time"),
        "effective_time_frame.time_interval",
    )

    frame["timestamp"] = pd.to_datetime(frame["timestamp"], utc=True)
    return frame.set_index("timestamp")




[docs]
def total_sleep_time_from_file(filename):
    ''' Read mHealth total sleep time from a file and convert it into a
    Niimpy compatible DataFrame using total_sleep_time.

    The file is parsed as UTF-8 encoded JSON and the parsed content is
    passed to total_sleep_time without changes. The returned dataframe is
    the one produced by total_sleep_time.

    Parameters
    ----------

    filename: string
        Path to the file containing mhealth formatted sleep duration data.

    Returns
    -------

    data: A pandas.DataFrame containing sleep duration data

    '''
    # JSON files are UTF-8 encoded. Do not rely on the platform default
    # encoding, which differs between systems.
    with open(filename, encoding="utf-8") as f:
        data = json.load(f)

    return total_sleep_time(data)



[docs]
def heart_rate(data_list):
    ''' Format the heart rate json data into a niimpy dataframe.

    The dataframe contains the columns
        - heart_rate : Heart rate measurement. The unit column is dropped
          without conversion, since the mhealth standard specifies the unit
          is always beats per minute.
        - start : start time of the measurement interval
        - end : end time of the measurement interval

    Each sample is measured either at an exact time or over a time interval.
    For samples with an exact time, "start" and "end" are NaT and the index
    is set to the measurement time. For samples with a time interval, the
    index is set to the start time.

    Any other fields in the records (such as the descriptive statistic or
    the temporal relationship to sleep or to physical activity) are kept
    under the column names produced by pandas.json_normalize.

    Parameters
    ----------

    data_list: list of dictionaries
        MHealth formatted heart rate data loaded using json.load().

    Returns
    -------

    data: A pandas.DataFrame containing heart rate data, indexed by
        "timestamp"
    '''
    df = pd.json_normalize(data_list)

    # The mhealth standard specifies the unit is always beats per minute.
    # We can drop the unit column without converting the value
    df = df.drop(columns="heart_rate.unit", errors="ignore")
    df = df.rename(columns={
        "heart_rate.value": "heart_rate",
    })

    # Each sample contains a time_frame, which is either a time interval or
    # a date-time string. Format the intervals first. This sets the
    # timestamp of the interval samples.
    df = format_time_interval(df, "effective_time_frame.time_interval")

    # Samples with an exact measurement time have no interval. Fill in their
    # timestamp from the date-time string. The column only exists if at
    # least one sample has an exact time.
    date_time_col = "effective_time_frame.date_time"
    if date_time_col in df.columns:
        exact_time = pd.to_datetime(df[date_time_col], utc=True)
        df["timestamp"] = df["timestamp"].fillna(exact_time)

    df.set_index('timestamp', inplace=True)
    return df




[docs]
def heart_rate_from_file(filename):
    '''Read mHealth formatted heart rate data from a file and convert it to
    a Niimpy compatible dataframe using heart_rate.

    The file is parsed as UTF-8 encoded JSON and the parsed content is
    passed to heart_rate without changes. The returned dataframe is the one
    produced by heart_rate.

    Parameters
    ----------

    filename: string
        Path to the file containing mhealth formatted heart rate data.

    Returns
    -------

    data: A pandas.DataFrame containing heart rate data
    '''
    # JSON files are UTF-8 encoded. Do not rely on the platform default
    # encoding, which differs between systems.
    with open(filename, encoding="utf-8") as f:
        data = json.load(f)

    return heart_rate(data)





[docs]
def geolocation(data_list):
    ''' Format the geolocation json data into a niimpy dataframe.

    The records are flattened with pandas.json_normalize. Only the rows
    where both the latitude and the longitude are expressed in degrees
    (unit "deg") are kept, and the value columns are renamed to "latitude"
    and "longitude".

    Parameters
    ----------

    data_list: list of dictionaries
        MHealth formatted geolocation data loaded using json.load().

    Returns
    -------

    data: A pandas.DataFrame containing geolocation data
    '''
    flattened = pd.json_normalize(data_list)

    # Keep rows where latitude and longitude are given correctly
    latitude_in_degrees = flattened["latitude.unit"] == "deg"
    longitude_in_degrees = flattened["longitude.unit"] == "deg"
    in_degrees = flattened[latitude_in_degrees & longitude_in_degrees]

    return in_degrees.rename(columns={
        "latitude.value": "latitude",
        "longitude.value": "longitude",
    })




[docs]
def geolocation_from_file(filename):
    '''Read mHealth formatted geolocation data from a file and convert it to
    a Niimpy compatible dataframe using geolocation.

    The file is parsed as UTF-8 encoded JSON and the parsed content is
    passed to geolocation without changes. The returned dataframe is the
    one produced by geolocation.

    Parameters
    ----------

    filename: string
        Path to the file containing mhealth formatted geolocation data.

    Returns
    -------

    data: A pandas.DataFrame containing geolocation data
    '''
    # JSON files are UTF-8 encoded. Do not rely on the platform default
    # encoding, which differs between systems.
    with open(filename, encoding="utf-8") as f:
        data = json.load(f)

    return geolocation(data)