Source code for niimpy.reading.read

"""Read data from various formats, user entery point.

This module contains various functions `read_*` which load data from different
formats into pandas.DataFrame:s.  As a side effect, it provides the
authoritative information on how incoming data is converted to dataframes.

"""

import pandas as pd
import warnings
import json

from niimpy.reading import database
from niimpy.preprocessing import util

def _read_preprocess(df, add_group=None):
    """Standard preprocessing arguments when reading.

    This is a preprocessing filter which handles some standard arguments
    when reading files.  This should be considered a private, unstable
    function.


    Parameters
    ----------

    df: pandas.DataFrame

        Input data frame

    add_group: string, optional

        If given, add a new 'group' column with all values set to this
        given identifier.


    Returns
    -------

    df: dataframe

        Resulting dataframe (modified in-place if possible, but may also
        be a copy)

    """
    if add_group is not None:
        df['group'] = add_group
        #df['group'] = df['group'].astype('category')
        #pd.Categorical(add_group)
    return df


def read_sqlite(filename, table, add_group=None, user=database.ALL, limit=None, offset=None, start=None, end=None, tz=None):
    """Read DataFrame from sqlite3 database

    Read sensor data from one table of a sqlite3 file, optionally
    applying user, row-count and time-range limits.

    Parameters
    ----------
    filename : str
        filename of sqlite3 database
    table : str
        table name of data within the database
    add_group : object
        If given, add a 'group' column with all values set to this.
    user : str or database.ALL, optional
        If given, return only data matching this user (based on column
        'user')
    limit : int, optional
        If given, return only this many rows
    offset : int, optional
        When used with limit, skip this many lines at the beginning
    start : int or float or str or datetime.datetime, optional
        If given, limit to this starting time.  Formats can be int/float
        (unixtime), string (parsed with dateutil.parser.parser), or
        datetime.datetime.
    end : int or float or str or datetime.datetime, optional
        Same meaning as 'start', but for end time
    tz : str, optional
        Timezone used to interpret timestamps, e.g. 'Europe/Helsinki'.
        Omitting it is deprecated and raises a DeprecationWarning.

    Returns
    -------
    pandas.DataFrame
    """
    if tz is None:
        warnings.warn(DeprecationWarning("From now on, you should explicitely specify timezone with e.g. tz='Europe/Helsinki'"), stacklevel=2)
    raw = database.Data1(filename, tz=tz).raw(table, user, limit=limit, offset=offset, start=start, end=end)
    return _read_preprocess(raw, add_group=add_group)
def read_sqlite_tables(filename):
    """Return names of all tables in this database

    Gives the set of every table contained in the sqlite3 file, which is
    useful for discovering what data a database holds.

    Parameters
    ----------
    filename : str
        filename of sqlite3 database

    Returns
    -------
    set of str
        Table names found in the database.
    """
    return database.Data1(filename).tables()
def _get_dataframe(df_or_database, table, user=None):
    """Read from database or directly use DataFrame

    Functions used to accept a database only; now the standard is a
    dataframe.  This provides some backwards compatibility between the
    old and new systems: DataFrames are used as-is, but if a database is
    given, it extracts the right information out of the table (and does
    what the database used to do to filter by user).  This function
    could also be used to transparently accept other types of data
    inputs.

    If input is:

    - a database: extract the given table/user using .raw() and return

    A typical usage is::

        def function(df):
            # 'df' could be a DataFrame or database
            df = _get_dataframe(df, 'TableName')
            # 'df' is now always a DataFrame

    Parameters
    ----------
    df_or_database : pandas.DataFrame or database.Data1
        Input data, either already a dataframe or a database handle.
    table : str
        Table to extract when a database is given (ignored for
        dataframes).
    user : str or database.ALL, optional
        If given, restrict the result to rows of this user.

    Returns
    -------
    df : DataFrame (same one if possible)
    """
    if isinstance(df_or_database, database.Data1):
        # BUG FIX: this used to pass user=subject, an undefined name,
        # which raised NameError whenever a database was given.
        df = df_or_database.raw(table=table, user=user)
    else:
        df = df_or_database
        # Input was *not* a database, so apply the user filter ourselves
        # (the database path already filters via .raw()).
        if user is not None and user is not database.ALL:
            df = df[df['user'] == user]
    return df
def read_csv(filename, read_csv_options=None, add_group=None, tz=None):
    """Read DataFrame from csv file

    This will read data from a csv file and then process the result with
    `niimpy.util.df_normalize`.

    Parameters
    ----------
    filename : str
        filename of csv file
    read_csv_options : dict, optional
        Dictionary of options to pandas.read_csv, if this is necessary
        for custom csv files.
    add_group : object
        If given, add a 'group' column with all values set to this.
    tz : str, optional
        Timezone used to interpret timestamps, e.g. 'Europe/Helsinki'.
        Omitting it is deprecated and raises a DeprecationWarning.

    Returns
    -------
    pandas.DataFrame
    """
    if tz is None:
        warnings.warn(DeprecationWarning("From now on, you should explicitely specify timezone with e.g. tz='Europe/Helsinki'"), stacklevel=2)
    # FIX: the default used to be a mutable dict literal ({}), which is
    # shared across calls; use a None sentinel instead (same behavior
    # for all existing callers).
    if read_csv_options is None:
        read_csv_options = {}
    df = pd.read_csv(filename, **read_csv_options)
    # df_normalize sets the index to time values and does other time
    # conversions.  Operates in-place.
    util.df_normalize(df, tz=tz)
    df = _read_preprocess(df, add_group=add_group)
    return df
def read_csv_string(string, tz=None):
    """Parse a string containing CSV and return dataframe

    This should not be used for serious reading of CSV from disk, but
    can be useful for tests and examples.  Various CSV reading options
    are turned on in order to be better for examples:

    - Allow comments in the CSV file
    - Remove the `datetime` column (redundant with `index` but some
      older functions break without it, so default readers need to
      leave it).

    Parameters
    ----------
    string : string containing CSV file
    tz : str, optional
        Timezone used to interpret timestamps, e.g. 'Europe/Helsinki'.
        Omitting it is deprecated and raises a DeprecationWarning.

    Returns
    -------
    df: pandas.DataFrame
    """
    if tz is None:
        warnings.warn(DeprecationWarning("From now on, you should explicitely specify timezone with e.g. tz='Europe/Helsinki'"), stacklevel=2)
    import io
    buffer = io.StringIO(string)
    df = read_csv(buffer, tz=tz, read_csv_options={'comment': '#'})
    # Drop the redundant 'datetime' column when present; ignore when
    # absent (same effect as the conditional `del` it replaces).
    df.drop(columns='datetime', errors='ignore', inplace=True)
    return df