Source code for niimpy.preprocessing.tracker

import pandas as pd
from niimpy.preprocessing import util

# Identifier columns that extract_features_tracker moves into the index of
# each computed feature whenever they are present among its columns.
group_by_columns = ["user", "device"]



[docs]
def step_summary(df, value_col='values', user_id=None, start_date=None, end_date=None):
    """Summarise daily step totals over a time range.

    The step values are summed per calendar day, rows belonging to days
    whose total is zero are discarded, and the median, mean, standard
    deviation, minimum and maximum of the remaining daily totals are
    reported. Every remaining row carries the total of its day, and the
    statistics are taken over those rows within the groups produced by
    ``util.group_data`` (presumably one group per user/device; see that
    helper for the exact grouping).

    Parameters
    ----------
    df : pandas.DataFrame
        Step counts. Must have a datetime index and a ``user`` column.
        The dataframe is not modified.
    value_col : str, optional
        Column containing the step values. Default value is "values".
    user_id : list, optional
        List of user ids to keep. If None, all users are summarised.
    start_date : str, optional
        Start of the time segment (label on the datetime index). If None,
        the segment is open at the start.
    end_date : str, optional
        End of the time segment (label on the datetime index). If None,
        the segment is open at the end.

    Returns
    -------
    summary_df : pandas.DataFrame
        Dataframe with the columns ``median_sum_step``, ``avg_sum_step``,
        ``std_sum_step``, ``min_sum_step`` and ``max_sum_step``, passed
        through ``util.reset_groups`` and ``util.select_columns``.

    Raises
    ------
    AssertionError
        If ``df`` has no ``user`` column, its index is not a datetime
        index, or ``user_id`` is given but is not a list.
    """

    assert 'user' in df.columns, 'User column does not exist'
    assert df.index.inferred_type == 'datetime64', "Dataframe must have a datetime index"

    if user_id is not None:
        assert isinstance(user_id, list), 'User id must be a list'
        # Element-wise membership test. `df['user'] in user_id` asks for the
        # truth value of a whole Series and raises ValueError.
        df = df[df['user'].isin(user_id)]

    # Label-based slice on the datetime index; a None bound leaves that side open.
    if start_date is not None or end_date is not None:
        df = df.loc[start_date:end_date]

    # Work on a copy so the helper columns below never leak into the
    # caller's dataframe (and are never written into a filtered view).
    df = df.copy()

    # The year is part of the key so that the same calendar day in different
    # years is not merged into one "daily" total.
    df['year'] = df.index.year
    df['month'] = df.index.month
    df['day'] = df.index.day

    # Calculate sum of steps for each date and broadcast it to every row of that date
    df['daily_sum'] = util.group_data(
        df, ['day', 'month', 'year']
    )[value_col].transform('sum')

    # Under the assumption that a user cannot have zero steps per day, we remove rows where daily_sum are zero
    df = df[df['daily_sum'] != 0]

    daily_totals = util.group_data(df)['daily_sum']

    summary_df = pd.DataFrame()
    summary_df['median_sum_step'] = daily_totals.median()
    summary_df['avg_sum_step'] = daily_totals.mean()
    summary_df['std_sum_step'] = daily_totals.std()
    summary_df['min_sum_step'] = daily_totals.min()
    summary_df['max_sum_step'] = daily_totals.max()

    summary_df = util.reset_groups(summary_df)
    summary_df = util.select_columns(
        summary_df,
        ["median_sum_step", "avg_sum_step", "std_sum_step", "min_sum_step", "max_sum_step"]
    )
    return summary_df




[docs]
def tracker_step_distribution(steps_df, steps_column='steps', resample_args=None, timeframe='d', **kwargs):
    """Return distribution of steps within a time range.

    The number of steps is summed per user according to the frequency rule in
    resample_args. Each of these sums is divided by the user's total number of
    steps in the larger time frame (given by the timeframe argument) that
    contains it.

    Using default parameters produces a daily step distribution.

    Parameters
    ----------
    steps_df : pandas.DataFrame
        Step data. Must contain the columns ``user``, ``date`` and ``time``
        (used to drop duplicated rows) and the step column, and must have an
        index that can be resampled (a datetime-like index).
    steps_column : str, optional
        Column containing the step values. Default value is "steps".
    resample_args : dict, optional
        Keyword arguments passed to the pandas resample call; must contain
        the key "rule". If None, ``{'rule': 'h'}`` is used.
    timeframe : str, optional
        Frequency of the time frame over which the distribution is computed.
        Default value is 'd'.
    **kwargs
        Accepted and ignored.

    Returns
    -------
    df : pandas.DataFrame
        Dataframe with the columns ``step_distribution`` (share of the time
        frame total) and ``step_sum`` (the time frame total).

    Raises
    ------
    AssertionError
        If ``steps_df`` is not a pandas DataFrame.
    ValueError
        If the time frame is not longer than the resample rule.
    """
    assert isinstance(steps_df, pd.DataFrame), "steps_df is not a pandas dataframe"

    # Build the default per call instead of sharing one mutable dict object
    # between all calls through the signature.
    if resample_args is None:
        resample_args = {'rule': 'h'}

    # time frame must be longer than resample_args["rule"]
    to_offset = pd.tseries.frequencies.to_offset
    if to_offset(timeframe) <= to_offset(resample_args["rule"]):
        raise ValueError("Time frame must be longer than resample rule")

    # Remove duplicates; drop_duplicates returns a new dataframe, so the
    # caller's data is left untouched.
    df = steps_df.drop_duplicates(subset=['user', 'date', 'time'], keep='last')

    # Sum the steps of each user per resample bin (hourly by default).
    # The result is indexed by (user, timestamp).
    steps = df.groupby(["user"]).resample(**resample_args, include_groups=False).agg({steps_column: 'sum'})

    # Total of each user within each time frame, broadcast to every bin of
    # that time frame. Computing it with a grouped transform keeps the totals
    # strictly inside one user and one time frame; aligning separately
    # resampled totals on their bin labels and forward-filling would let a
    # total spill over into the rows of the next user whenever that user's
    # first bin does not coincide with a time frame label.
    steps["step_sum"] = steps.groupby(
        ["user", pd.Grouper(level=1, freq=timeframe)]
    )[steps_column].transform("sum")

    # Divide the binned steps by the time frame total to get the distribution
    steps['step_distribution'] = steps[steps_column] / steps['step_sum']

    # Select columns
    steps = util.select_columns(steps, ["step_distribution", "step_sum"])
    return steps[["step_distribution", "step_sum"]]



# Default feature registry: every module-level object whose name starts with
# "tracker_" at this point of the module, mapped to an empty dict of keyword
# arguments. Only names defined above this statement are picked up.
ALL_FEATURES = {
    feature: {}
    for feature_name, feature in list(globals().items())
    if feature_name.startswith('tracker_')
}


[docs]
def extract_features_tracker(df, features=None):
    """Compute the selected tracker features and combine them into one dataframe.

    Intended for tracker data (the original documentation mentions recordings
    made with a Polar Ignite).

    Every feature function is called as ``feature_function(df, **kwargs)``.
    Any of the ``group_by_columns`` present among the columns of its result
    are appended to the index, and the results are concatenated column-wise.

    The features collected by default are the module-level functions whose
    name starts with ``tracker_``, e.g. ``tracker_step_distribution``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data frame.
    features : dict, optional
        Dictionary mapping each feature function (the callable itself) to a
        dictionary of keyword arguments passed to that function.
        If None, all default features are computed with their default
        arguments.

    Returns
    -------
    result : pandas.DataFrame
        The concatenated features, passed through ``util.reset_groups``.
        If ``df`` has a ``group`` column, a ``group`` column holding the
        first ``group`` value of each user is added.

    Raises
    ------
    AssertionError
        If ``df`` is not a pandas DataFrame.
    """
    assert isinstance(df, pd.DataFrame), "Please input data as a pandas DataFrame type"

    if features is None:
        features = ALL_FEATURES

    computed_features = []
    for feature_function, kwargs in features.items():
        computed_feature = feature_function(df, **kwargs)
        # Move whichever identifier columns the feature returned into the
        # index so that the column-wise concat below aligns on them.
        index_by = list(set(group_by_columns) & set(computed_feature.columns))
        computed_feature = computed_feature.set_index(index_by, append=True)
        computed_features.append(computed_feature)

    computed_features = pd.concat(computed_features, axis=1)

    if 'group' in df:
        computed_features['group'] = df.groupby('user')['group'].first()

    computed_features = util.reset_groups(computed_features)
    return computed_features