Source code for niimpy.preprocessing.application

import numpy as np
import pandas as pd

from niimpy.preprocessing import battery as b
from niimpy.preprocessing import screen as s
from niimpy.preprocessing import util


# Default lookup table from application display name to app group.
# It is the default ``group_map`` of the functions in this module; names that
# are not listed end up in the "na" group (see ``classify_app``).
#
# NOTE(review): several keys appear more than once below (e.g. "Fit",
# "Sports Tracker", "Pokémon GO", "Kuvat"). As written they map to the same
# group each time, so the repeats are harmless, but confirm against the raw
# file that they are true duplicates and not look-alike names that differ in
# invisible characters or Unicode normalisation before removing any of them.
MAP_APP = {
    # --- sports ---
    "CrossCycle": "sports",
    "Runtastic": "sports",
    "Polar Flow": "sports",
    "Pedometer - Step Counter": "sports",
    "STAMINA-tila": "sports",
    "Fit": "sports",
    "Modo STAMINA": "sports",
    "7 MINUTE WORKOUT": "sports",
    "Moves": "sports",
    "Six Pack in 30 Days": "sports",
    "Bodyweight": "sports",
    "Sports Tracker": "sports",
    "Fit": "sports",
    "Sports Tracker": "sports",
    "Pedometer Step Counter": "sports",
    "MyFitnessPal": "sports",
    "Endomondo": "sports",
    "Health Mate": "sports",
    "Upsi": "sports",
    # --- games ---
    "Mahjong": "games",
    "Solitaire": "games",
    "Solitaire Collection": "games",
    "Paradise Island 2": "games",
    "Steam": "games",
    "Hidden City": "games",
    "Dokkan Battle": "games",
    "Super Mario Run": "games",
    "Candy Crush Saga": "games",
    "Jeopardy!": "games",
    "Clash Royale": "games",
    "Calcy IV": "games",
    "QuizTaisto PREMIUM": "games",
    "PlayStation": "games",
    "Cleopatra Jewels": "games",
    "DraStic": "games",
    "XCOM": "games",
    "The Secret Society": "games",
    "Pokémon GO": "games",
    "Hearthstone": "games",
    "I Love Hue": "games",
    "Destiny": "games",
    "Castle Cats": "games",
    "Pocket Camp": "games",
    "Hatch": "games",
    "1010!": "games",
    "AirConsole": "games",
    "Sanapala": "games",
    "Head Ball 2": "games",
    "Pokémon GO": "games",
    "Wordfeud FREE": "games",
    "Pyramid Solitaire Saga": "games",
    "Match and Explore": "games",
    "Twitch": "games",
    "Mahjong": "games",
    "Solitaire": "games",
    "Pokémon GO": "games",
    "Game Launcher": "games",
    "Hay Day": "games",
    "State of Survival": "games",
    "Wordfeud Free": "games",
    # --- comm (communication) ---
    "Sähköposti": "comm",
    "Gmail": "comm",
    "Puhelin – puheluiden hallinta": "comm",
    "Teléfono": "comm",
    "Email": "comm",
    "Outlook": "comm",
    "Skype": "comm",
    "Romantic love messages": "comm",
    "Dialer": "comm",
    "Discord": "comm",
    "WhatsApp": "comm",
    "Telegram": "comm",
    "Phone": "comm",
    "Teléfono": "comm",
    "Messages": "comm",
    "Messenger Lite": "comm",
    "Puhelin": "comm",
    "Mensajería": "comm",
    "Numerovalitsin": "comm",
    "Messenger": "comm",
    "LINE": "comm",
    "Dual Messenger": "comm",
    # NOTE(review): "Telegeram" looks like a misspelling of "Telegram"; keys
    # are matched verbatim against the data, so confirm whether this spelling
    # actually occurs there before changing it.
    "Telegeram": "comm",
    "Mensajería": "comm",
    "Googlen tekstistä puheeksi -moottori": "comm",
    "LINE Camera": "comm",
    "Signal": "comm",
    "Viber": "comm",
    "Viestit": "comm",
    "Amino": "comm",
    "Fonecta Caller": "comm",
    "ICE - In Case of Emergency": "comm",
    "Orbot": "comm",
    "Puhelu": "comm",
    "Puhelutallennin": "comm",
    # "Kuvat" is a utility entry sitting inside the comm run; it is repeated
    # with the same value in the utility section further down.
    "Kuvat": "utility",
    "TikTok": "comm",
    "Sähköposti": "comm",
    "MysticMessenger": "comm",
    # --- socialmedia ---
    "Pinterest": "socialmedia",
    "Tumblr": "socialmedia",
    "Snapchat": "socialmedia",
    "Twitter": "socialmedia",
    "Hootsuite": "socialmedia",
    "We Heart It": "socialmedia",
    "Instagram": "socialmedia",
    "Jodel": "socialmedia",
    "happn": "socialmedia",
    "LinkedIn": "socialmedia",
    "Facebook": "socialmedia",
    "Tinder": "socialmedia",
    "SDP Kansalaispaneeli": "socialmedia",
    "Grindr": "socialmedia",
    "ROMEO UNCUT": "socialmedia",
    # --- news ---
    "Geo News": "news",
    "Helsingin Sanomat": "news",
    "Yle Areena": "news",
    "Uutisvahti": "news",
    "Flipboard": "news",
    "Kauppalehti": "news",
    "Ilta-Sanomat": "news",
    "Iltalehti": "news",
    "mtv": "news",
    "upday": "news",
    "MTV Uutiset": "news",
    "Sää": "news",
    "Weather": "news",
    # --- travel ---
    "Booking.com Hotellit": "travel",
    "Airbnb": "travel",
    "Booking.com": "travel",
    "TripAdvisor": "travel",
    "Couchsurfing": "travel",
    "Bonusway": "travel",
    "TUI Suomi": "travel",
    "Norwegian": "travel",
    "Booking.com": "travel",
    # --- shop ---
    "OPSkin": "shop",
    "Iso Omena": "shop",
    "Lunchie Market": "shop",
    "AliExpress": "shop",
    "Frank App": "shop",
    "Hesburger": "shop",
    "MobilePay": "shop",
    "Zalando": "shop",
    "WeShare": "shop",
    "Wish": "shop",
    "eBay": "shop",
    "Aktia Wallet": "shop",
    "S-mobiili": "shop",
    "Klarna": "shop",
    "PINS": "shop",
    "McDonalds": "shop",
    "K-Ruoka": "shop",
    "Wrapp": "shop",
    "Wolt": "shop",
    "Ticketmaster": "shop",
    "H&M": "shop",
    "EspressoHouse": "shop",
    "ResQ Club": "shop",
    "Momotoko": "shop",
    "Pivo": "shop",
    "Lunchie Market": "shop",
    "EspressoHouse": "shop",
    # --- work ---
    "Sheets": "work",
    "Slack": "work",
    "My Files": "work",
    "Dropbox": "work",
    "Moodle": "work",
    "Knudge.me": "work",
    "Wilma": "work",
    "Docs": "work",
    "Zoom": "work",
    "Teams": "work",
    "KDE Connect": "work",
    "Linkity Pro": "work",
    "Timely": "work",
    "OneDrive": "work",
    # --- transport ---
    "Uber": "transport",
    "VR Lähijunat": "transport",
    "HSL": "transport",
    "HSL Mobiililippu": "transport",
    "CityTrack": "transport",
    # --- leisure ---
    "Podcast Player": "leisure",
    "Samsung Music": "leisure",
    "Google Play Music": "leisure",
    "Shazam": "leisure",
    "Photos": "leisure",
    "Player FM": "leisure",
    "Crowst": "leisure",
    "Leffapeli": "leisure",
    "WEBTOON": "leisure",
    "Tarot Reading": "leisure",
    "Duolingo": "leisure",
    "Crunchyroll": "leisure",
    "SoundHound": "leisure",
    "LiveTulokset": "leisure",
    "Youtify": "leisure",
    "Kuvakaappaus": "leisure",
    "Tarot Universe": "leisure",
    "Norstat": "leisure",
    "Enkeli-tarot": "leisure",
    "Podcast Republic": "leisure",
    "Audiobooks": "leisure",
    "9GAG": "leisure",
    "Netflix": "leisure",
    "Pornhub": "leisure",
    "Musiikki": "leisure",
    "YouTube": "leisure",
    "Imgur": "leisure",
    "Google-sovellus": "leisure",
    "Chrome": "leisure",
    "YouTube Music": "leisure",
    "Peel Remote": "leisure",
    "Music Center": "leisure",
    "SoundCloud": "leisure",
    "Spotify": "leisure",
    "Google Play Musiikki": "leisure",
    "MadLipz": "leisure",
    "HAVEN KBH": "leisure",
    "Internet": "leisure",
    "Podcast Go": "leisure",
    "TuneIn Radio": "leisure",
    "pixiv": "leisure",
    "Pic Collage": "leisure",
    "Radio": "leisure",
    "myTuner Free": "leisure",
    "Audiobooks": "leisure",
    "FaceApp": "leisure",
    "Podcast Republic": "leisure",
    "Libby": "leisure",
    "Headspace": "leisure",
    "BookBeat": "leisure",
    "Edge": "leisure",
    "Google": "leisure",
    "Nextory": "leisure",
    # --- system ---
    "Android System": "system",
    "Android system": "system",
    "Android-järjestelmä": "system",
    "Android-süsteem": "system",
    "Download Manager": "system",
    "Järj. UI": "system",
    "Käyttöliitt.": "system",
    "Latauksen hallinta": "system",
    "Lataustenhallinta": "system",
    "System UI": "system",
    "Ohjelmistopäivitys": "system",
    "Optimoija": "system",
    # --- security ---
    "Avast Mobile Security": "security",
    "Elisa Turvapaketti": "security",
    "F-Secure SAFE": "security",
    "Freedome": "security",
    "Telia Turvapaketti": "security",
    "McAfee Security": "security",
    # --- utility ---
    "Camera": "utility",
    "Clock": "utility",
    "Galleria": "utility",
    "Google Play Kauppa": "utility",
    "Google Play Palvelut": "utility",
    "Google Play Store": "utility",
    "Galaxy Store": "utility",
    "Kalenteri": "utility",
    "Kamera": "utility",
    "Kello": "utility",
    "Kuvat": "utility",
    # NOTE(review): the next key looks like a mis-decoded "Smart<zero-width
    # space>Things"; it is presumably kept in this form because the recorded
    # data contains the same byte sequence - verify before "fixing" it.
    "Smartâ\\x80\\x8bThings": "utility",
    "Maps": "utility",
    "Samsung capture": "utility",
    # --- wellbeing ---
    "Daylio": "wellbeing",
    "MoMoMood": "wellbeing",
}


def classify_app(df, app_column_name="application_name", group_map=MAP_APP, **kwargs):
    """Label every row of ``df`` with the app group of its application name.

    Helper used by the feature functions of this module. The frame is
    modified in place (an ``app_group`` column is added or overwritten) and
    the same object is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data frame.
    app_column_name : str, optional
        Column containing the app name. Defaults to 'application_name'.
    group_map : dict, optional
        Mapping from app name to app group (e.g. ``{'my_app': 'my_group'}``).
        Defaults to ``MAP_APP``. Names that are not keys of the mapping are
        labelled ``"na"``.
    **kwargs
        Ignored; accepted so callers can forward a shared set of keyword
        arguments.

    Returns
    -------
    df : pandas.DataFrame
        The input frame with the additional ``app_group`` column.
    """
    assert isinstance(df, pd.DataFrame), "df is not a pandas dataframe."

    # One vectorised lookup instead of one boolean mask per mapping entry.
    # Names without a mapping come back as NaN and are labelled "na".
    df["app_group"] = df[app_column_name].map(group_map).fillna("na")
    return df
def app_count(df, bat=None, screen=None, group_map=MAP_APP, resample_args=None, **kwargs):
    """Count app events per app group within each resampling window.

    Rows of ``df`` are classified with ``classify_app``. If screen data is
    given, the output of ``screen.screen_off`` is appended; otherwise, if
    battery data is given, the output of ``battery.shutdown_info`` is
    appended. Appended rows without an app group are labelled ``"off"``.
    The rows are then grouped through ``util.group_data`` by app group and
    counted per resampling window.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data frame. Must contain a 'datetime' column and the app name
        column.
    bat : pandas.DataFrame, optional
        Battery data. ``None`` is passed through ``util.ensure_dataframe``.
    screen : pandas.DataFrame, optional
        Screen data. ``None`` is passed through ``util.ensure_dataframe``.
    group_map : dict, optional
        Mapping from app name to app group. Defaults to ``MAP_APP``.
    resample_args : dict, optional
        Keyword arguments forwarded to ``resample``. Defaults to
        ``{'rule': '30min'}``.
    **kwargs
        Forwarded to ``classify_app``, ``screen.screen_off`` and
        ``battery.shutdown_info``. ``app_column_name`` (default
        'application_name') names the column holding the app name.

    Returns
    -------
    result : pandas.DataFrame or None
        Frame with the columns selected by ``util.select_columns`` for
        'app_group' and 'count', or ``None`` when no rows remain after
        dropping incomplete rows.
    """
    assert isinstance(df, pd.DataFrame), "Please input data as a pandas DataFrame type"

    if resample_args is None:
        resample_args = {"rule": "30min"}
    # The app-name column is configurable through kwargs; use the same name
    # here that classify_app receives instead of a hard-coded one.
    app_column_name = kwargs.get("app_column_name", "application_name")

    bat = util.ensure_dataframe(bat)
    screen = util.ensure_dataframe(screen)

    df2 = classify_app(df, group_map=group_map, **kwargs)

    def _append_events(frame, events):
        # Append auxiliary rows, order chronologically within the identifier
        # columns that are actually present, and label the new rows "off".
        if isinstance(events.index, pd.MultiIndex):
            events = events.reset_index().set_index("index")
        frame = pd.concat([frame, events])
        sort_by = [c for c in ("user", "device") if c in frame.columns] + ["datetime"]
        frame = frame.sort_values(by=sort_by)
        return frame.fillna({"app_group": "off"}), events

    # Insert missing data due to the screen being off or the battery depleted.
    if not screen.empty:
        df2, screen = _append_events(df2, s.screen_off(screen, bat, **kwargs))

    # `screen` may by now hold the (possibly empty) screen_off output, so this
    # is deliberately a second `if` rather than an `elif`.
    if screen.empty and not bat.empty:
        shutdown = b.shutdown_info(bat, **kwargs).replace([-1, -2], "off")
        df2, _ = _append_events(df2, shutdown)

    keep_columns = [c for c in ("user", "device", "group") if c in df.columns]
    # NOTE(review): dropna() removes every row with a missing value in any of
    # the kept columns; appended "off" rows survive only if they carry an app
    # name - confirm this is intended.
    df2 = df2[keep_columns + ["datetime", "app_group", app_column_name]].dropna()

    if df2.empty:
        return None

    df2["datetime"] = pd.to_datetime(df2["datetime"])
    df2 = df2.set_index("datetime")

    result = (
        util.group_data(df2, "app_group")["app_group"]
        .resample(**resample_args, include_groups=False)
        .count()
    )
    result = pd.DataFrame(result).rename(columns={"app_group": "count"})
    result = util.reset_groups(result, "app_group")
    result = util.select_columns(result, ["app_group", "count"])
    return result
def app_duration(
    df,
    bat=None,
    screen=None,
    group_map=MAP_APP,
    resample_args=None,
    outlier_threshold="10h",
    **kwargs,
):
    """Sum the time spent in each app group within each resampling window.

    Rows of ``df`` are classified with ``classify_app``. If screen data is
    given, the output of ``screen.screen_off`` is appended; otherwise, if
    battery data is given, the output of ``battery.shutdown_info`` is
    appended. Appended rows without an app group are labelled ``"off"``.
    Within every group produced by ``util.group_data`` the timeline is
    extended with the resampling-window boundaries (forward-filling the
    active app group), and the duration of a row is the time until the next
    row of the same group. Durations are finally summed per app group and
    window.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data frame. Must contain 'time' and 'datetime' columns and the
        app name column.
    bat : pandas.DataFrame, optional
        Battery data. ``None`` is passed through ``util.ensure_dataframe``.
    screen : pandas.DataFrame, optional
        Screen data. ``None`` is passed through ``util.ensure_dataframe``.
    group_map : dict, optional
        Mapping from app name to app group. Defaults to ``MAP_APP``.
    resample_args : dict, optional
        Keyword arguments forwarded to ``resample``; its 'rule' entry is also
        used as the spacing of the inserted window boundaries. Defaults to
        ``{'rule': '30min'}``.
    outlier_threshold : str, optional
        Durations longer than this (parsed by ``pandas.Timedelta``) are
        discarded. Defaults to '10h'.
    **kwargs
        Forwarded to ``classify_app``, ``screen.screen_off`` and
        ``battery.shutdown_info``.

    Returns
    -------
    result : pandas.DataFrame or None
        Frame with the columns selected by ``util.select_columns`` for
        'app_group' and 'duration' (seconds), or ``None`` when no rows remain
        after filtering.
    """
    assert isinstance(df, pd.DataFrame), "Please input data as a pandas DataFrame type"

    if resample_args is None:
        resample_args = {"rule": "30min"}

    bat = util.ensure_dataframe(bat)
    screen = util.ensure_dataframe(screen)

    # Identifier columns present in the input, in a fixed order.
    niimpy_cols = [c for c in ("group", "user", "device") if c in df.columns]

    df2 = classify_app(df, group_map=group_map, **kwargs)

    def _append_events(frame, events):
        # Append auxiliary rows, order chronologically within the identifier
        # columns, and label the new rows "off".
        if isinstance(events.index, pd.MultiIndex):
            events = events.reset_index().set_index("index")
        frame = pd.concat([frame, events])
        frame = frame.sort_values(by=niimpy_cols + ["datetime"])
        return frame.fillna({"app_group": "off"}), events

    # Insert missing data due to the screen being off or the battery depleted.
    if not screen.empty:
        df2, screen = _append_events(df2, s.screen_off(screen, bat, **kwargs))

    # `screen` may by now hold the (possibly empty) screen_off output, so this
    # is deliberately a second `if` rather than an `elif`.
    if screen.empty and not bat.empty:
        shutdown = b.shutdown_info(bat, **kwargs).replace([-1, -2], "off")
        df2, _ = _append_events(df2, shutdown)

    df2 = df2[niimpy_cols + ["time", "datetime", "app_group"]]

    def resample_group(group):
        # Add the window boundaries to the group's timeline so that a session
        # spanning several windows is split at each boundary.
        rule = resample_args["rule"]
        all_times = pd.date_range(
            start=group.index.min().round(rule),
            end=group.index.max().ceil(rule),
            freq=rule,
        )
        boundaries = pd.DataFrame(index=all_times)
        resampled_group = group.join(boundaries, how="outer").ffill()
        resampled_group["datetime"] = resampled_group.index
        # Time until the next row of the SAME group. Computing this here
        # (instead of on the concatenated frame) keeps the last row of one
        # group from receiving a duration that reaches into the next group.
        resampled_group["duration"] = resampled_group["datetime"].diff().shift(-1)
        return resampled_group

    # Apply resampling to each group.
    df2 = util.group_data(df2).apply(resample_group, include_groups=False)
    df2 = util.reset_groups(df2)

    if "duration" not in df2.columns:
        # No group was processed (empty input), so there is nothing to sum.
        return None

    # Filter outliers by duration. Default threshold is 10 hours.
    outlier_threshold = pd.Timedelta(outlier_threshold)
    df2 = df2[~(df2["duration"] > outlier_threshold)].copy()
    df2["duration"] = df2["duration"].dt.total_seconds()
    # Drop non-positive durations and every row with a missing value (this
    # includes the last row of each group, which has no successor).
    df2 = df2[~(df2["duration"] <= 0)].dropna()

    if df2.empty:
        return None

    df2["datetime"] = pd.to_datetime(df2["datetime"])
    df2 = df2.set_index("datetime")

    result = (
        util.group_data(df2, "app_group")["duration"]
        .resample(**resample_args, include_groups=False)
        .sum()
    )
    result = pd.DataFrame(result)
    result = util.reset_groups(result, "app_group")
    result = util.select_columns(result, ["app_group", "duration"])
    return result
# Registry of the default features: every module-level object whose name
# starts with "app_" that exists at this point of the module, mapped to an
# empty keyword-argument dict.
ALL_FEATURES = {
    obj: {}
    for name, obj in list(globals().items())
    if name.startswith("app_")
}
def extract_features_app(df, bat=None, screen=None, features=None):
    """Compute the selected application features and combine them.

    Every feature function is called as ``feature(df, bat, screen, **args)``.
    Each result is re-indexed with ``util.set_conserved_index`` on
    'app_group', the results are concatenated column-wise, and the group
    index is reset with ``util.reset_groups``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data frame.
    bat : pandas.DataFrame, optional
        Battery data. ``None`` is passed through ``util.ensure_dataframe``.
    screen : pandas.DataFrame, optional
        Screen data. ``None`` is passed through ``util.ensure_dataframe``.
    features : dict, optional
        Maps feature functions to dicts of keyword arguments for them. If
        ``None``, ``ALL_FEATURES`` is used.

    Returns
    -------
    result : pandas.DataFrame
        The combined features. An empty frame is returned when no feature
        produced any data.
    """
    assert isinstance(df, pd.DataFrame), "Please input data as a pandas DataFrame type"

    bat = util.ensure_dataframe(bat)
    screen = util.ensure_dataframe(screen)

    if features is None:
        features = ALL_FEATURES
    else:
        assert isinstance(features, dict), "Please input the features as a dictionary"

    computed_features = []
    for feature, feature_arg in features.items():
        computed_feature = feature(df, bat, screen, **feature_arg)
        if computed_feature is None:
            # The feature functions in this module return None when no rows
            # remain; skip those instead of handing None to the index helper.
            continue
        computed_features.append(
            util.set_conserved_index(computed_feature, "app_group")
        )

    if not computed_features:
        # Nothing to concatenate (pd.concat would raise on an empty list).
        return pd.DataFrame()

    computed_features = pd.concat(computed_features, axis=1)
    # Index the result only by the original index (datetime).
    computed_features = util.reset_groups(computed_features, "app_group")
    return computed_features