"""Read data from various formats, user entery point.
This module contains various functions `read_*` which load data from different
formats into pandas.DataFrame:s. As a side effect, it provides the
authoritative information on how incoming data is converted to dataframes.
"""
import pandas as pd
import json
# Maps the unit abbreviations used in mHealth duration entries to the
# spelled-out unit names used when durations are converted to timedeltas.
mHealth_duration_units = {
    # Sub-second units
    "ps": "picoseconds",
    "ns": "nanoseconds",
    "us": "microseconds",
    "ms": "milliseconds",
    # Clock units
    "sec": "seconds",
    "min": "minutes",
    "h": "hours",
    # Calendar units
    "d": "days",
    "wk": "weeks",
    "Mo": "months",
    "yr": "years",
}
[docs]
def duration_to_timedelta(df, duration_col):
    ''' Convert an mHealth duration entry into a single timedelta column.

    In the mHealth format a duration is a dictionary that contains a value
    and a unit. The dataframe should therefore contain the two columns
    DURATION_COL_NAME.value and DURATION_COL_NAME.unit.

    Parameters
    ----------
    df: pandas.DataFrame
        Data that may contain the two duration columns. The dataframe passed
        in is not modified.
    duration_col: string
        Name of the duration entry. This is the prefix of the value and unit
        columns and the name of the resulting timedelta column.

    Returns
    -------
    data: A pandas.DataFrame with a DURATION_COL_NAME column containing
        timedeltas. Rows without a value are left missing. The value and
        unit columns are dropped. If the value column does not exist, df
        itself is returned unchanged.

    Raises
    ------
    KeyError
        If the unit column is missing, or a row that has a value has a unit
        that is not a key of mHealth_duration_units.
    '''
    duration_value_col = f'{duration_col}.value'
    duration_unit_col = f'{duration_col}.unit'

    # Nothing to convert: hand the data back untouched.
    if duration_value_col not in df.columns:
        return df

    values = df[duration_value_col]
    units = df[duration_unit_col]
    has_value = values.notna()

    # Start from an all-missing timedelta column so that the result has a
    # well-defined dtype even when no row carries a duration.
    durations = pd.Series(
        [pd.NaT] * len(df), index=df.index, dtype="timedelta64[ns]"
    )

    # Convert all rows sharing a unit in one call instead of row by row.
    for mhealth_unit in units[has_value].unique():
        try:
            unit = mHealth_duration_units[mhealth_unit]
        except KeyError:
            raise KeyError(
                f"Unknown mHealth duration unit {mhealth_unit!r} "
                f"in column {duration_unit_col!r}"
            ) from None

        selected = has_value & (units == mhealth_unit)
        unit_values = values[selected]
        if unit == "picoseconds":
            # Picoseconds are expressed as (fractional) nanoseconds before
            # the conversion.
            unit = "nanoseconds"
            unit_values = unit_values * 0.001

        # NOTE(review): the unit name is handed to pandas unchanged; confirm
        # that pandas accepts every name in mHealth_duration_units (for
        # example "weeks", "months" and "years").
        converted = pd.to_timedelta(unit_values, unit=unit)
        # Assign by position so the result does not depend on index alignment.
        durations.loc[selected] = converted.to_numpy()

    # drop() returns a new dataframe, so the caller's data stays as it was.
    result = df.drop([duration_value_col, duration_unit_col], axis=1)
    if duration_col in result.columns:
        # Keep what an already existing column holds for rows that have no
        # duration value.
        result.loc[has_value, duration_col] = durations[has_value]
    else:
        result[duration_col] = durations
    return result
[docs]
def total_sleep_time(data_list):
    '''
    Format mHealth total sleep time data from json formatted data to a Niimpy
    compatible DataFrame.

    The records are flattened with pandas.json_normalize. The
    "total_sleep_time" value and unit are combined into a single

    - total_sleep_time : The total sleep time as a timedelta

    column by duration_to_timedelta. The measurement interval found under
    "effective_time_frame.time_interval" is formatted by
    format_time_interval, which is expected to provide the "timestamp"
    column used as the index. According to the mHealth format the interval
    is described either by

    - start : start time of the measurement interval
    - end : end time of the measurement interval

    or

    - date : the date of the measurement
    - part_of_day : the time of day the measurement was made

    Descriptive statistics entries present in the data, such as

    - descriptive_statistics : Describes how the measurement is calculated
    - descriptive_statistics_denominator : Time interval the above
      description refers to.

    are kept as flattened columns.

    The dataframe is indexed by "timestamp", converted to UTC.

    Parameters
    ----------
    data_list: list of dictionaries
        MHealth formatted sleep duration data loaded with json.load()

    Returns
    -------
    data: A pandas.DataFrame containing sleep duration data
    '''
    flattened = pd.json_normalize(data_list)
    with_duration = duration_to_timedelta(flattened, "total_sleep_time")
    with_interval = format_time_interval(
        with_duration, "effective_time_frame.time_interval"
    )

    # Make the timestamps timezone aware (UTC) before using them as index.
    utc_timestamps = pd.to_datetime(with_interval["timestamp"], utc=True)
    return with_interval.assign(timestamp=utc_timestamps).set_index("timestamp")
[docs]
def total_sleep_time_from_file(filename):
    ''' Read mHealth total sleep time from a file and convert into a Niimpy
    compatible DataFrame using total_sleep_time.

    The file is parsed as UTF-8 encoded JSON and the result is passed to
    total_sleep_time, which documents the resulting columns: the total sleep
    time measurement, the measurement interval columns (either start and
    end, or date and part_of_day) and possible descriptive statistics
    columns (descriptive_statistics and descriptive_statistics_denominator).

    The dataframe is indexed by "timestamp".

    Parameters
    ----------
    filename: string
        Path to the file containing mhealth formatted sleep duration data.

    Returns
    -------
    data: A pandas.DataFrame containing sleep duration data
    '''
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(filename, encoding="utf-8") as f:
        data = json.load(f)
    return total_sleep_time(data)
[docs]
def heart_rate(data_list):
    ''' Format the heart rate json data into a niimpy dataframe.

    The dataframe contains the columns

    - heart_rate : Heart rate measurement in beats per minute
    - (optional) time interval columns
    - (optional) descriptive statistics column, a string
    - (optional) temporal relationship to sleep column, a string
    - (optional) temporal relationship to physical activity column, a
      string

    Measurement time or interval columns. If an exact time is given, it is
    converted to a UTC timestamp and used as the index. If a time interval
    is given, it is formatted by format_time_interval, which is expected to
    set two additional columns

    - start : start time of the measurement interval
    - end : end time of the measurement interval

    and the timestamp to the start time. Data in which no sample has an
    exact time is accepted; the timestamp then has to come from the time
    interval.

    The descriptive statistics column describes how the value is calculated
    over the given time interval. For example, "average" would denote a mean
    over the time period.

    The temporal relationship to sleep is one of "before sleeping", "during
    sleep" or "on waking".

    The temporal relationship to physical activity is one of "at rest",
    "active", "before exercise", "after exercise" or "during exercise".

    Parameters
    ----------
    data_list: list of dictionaries
        MHealth formatted heart rate data loaded using json.load().

    Returns
    -------
    data: A pandas.DataFrame containing heart rate data
    '''
    df = pd.json_normalize(data_list)

    # The mhealth standard specifies the unit is always beats per minute.
    # We can drop the unit column without converting the value
    df = df.drop("heart_rate.unit", axis=1)
    df = df.rename(columns={"heart_rate.value": "heart_rate"})

    # Each sample contains a time_frame, which is either a time interval or
    # a date-time string. The date-time column only exists when at least one
    # sample uses it, so check before reading it.
    date_time_col = "effective_time_frame.date_time"
    if date_time_col in df.columns:
        rows = ~df[date_time_col].isna()
        df.loc[rows, "timestamp"] = pd.to_datetime(df[date_time_col], utc=True)

    df = format_time_interval(df, "effective_time_frame.time_interval")
    df.set_index('timestamp', inplace=True)
    return df
[docs]
def heart_rate_from_file(filename):
    '''Read mHealth formatted heart rate data from a file and convert it to
    a Niimpy compatible dataframe.

    The file is parsed as UTF-8 encoded JSON and the result is passed to
    heart_rate, which documents the resulting columns in detail:

    - heart_rate : Heart rate measurement in beats per minute
    - (optional) time interval columns
    - (optional) descriptive statistics column, a string
    - (optional) temporal relationship to sleep column, a string
    - (optional) temporal relationship to physical activity column, a
      string

    Parameters
    ----------
    filename: string
        Path to the file containing mhealth formatted heart rate data.

    Returns
    -------
    data: A pandas.DataFrame containing heart rate data
    '''
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(filename, encoding="utf-8") as f:
        data = json.load(f)
    return heart_rate(data)
[docs]
def geolocation(data_list):
    ''' Format the geolocation json data into a niimpy dataframe.

    The records are flattened with pandas.json_normalize. Only rows whose
    latitude and longitude units are both "deg" are kept, and the
    "latitude.value" and "longitude.value" columns are renamed to
    "latitude" and "longitude". All other columns are left as they are.

    Parameters
    ----------
    data_list: list of dictionaries
        MHealth formatted geolocation data loaded using json.load().

    Returns
    -------
    data: A pandas.DataFrame containing geolocation data
    '''
    flattened = pd.json_normalize(data_list)

    # Keep rows where latitude and longitude are given correctly
    in_degrees = (flattened["latitude.unit"] == "deg") & (
        flattened["longitude.unit"] == "deg"
    )
    return flattened[in_degrees].rename(
        columns={
            "latitude.value": "latitude",
            "longitude.value": "longitude",
        }
    )
[docs]
def geolocation_from_file(filename):
    '''Read mHealth formatted geolocation data from a file and convert it to
    a Niimpy compatible dataframe.

    The file is parsed as UTF-8 encoded JSON and the result is passed to
    geolocation.

    Parameters
    ----------
    filename: string
        Path to the file containing mhealth formatted geolocation data.

    Returns
    -------
    data: A pandas.DataFrame containing geolocation data
    '''
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(filename, encoding="utf-8") as f:
        data = json.load(f)
    return geolocation(data)