Source code for streamline.utils.dataset

import csv
import logging
import os

import pandas as pd


class Dataset:
    """
    Tabular dataset read from a delimited text file, together with the names
    of its non-feature columns (outcome, match group and instance identifier).
    """

    # Column separator passed to pandas for each supported file extension
    _SEPARATORS = {'csv': ',', 'tsv': '\t', 'txt': ' '}

    def __init__(self, dataset_path, class_label, match_label=None, instance_label=None):
        """
        Creates dataset with path of tabular file and loads it immediately

        Args:
            dataset_path: path of tabular file (as csv, tsv, or txt); a string
                or any os.PathLike object
            class_label: column label for the outcome to be predicted in the dataset
            match_label: column to identify unique groups of instances in the dataset
                that have been 'matched' as part of preparing the dataset with cases
                and controls that have been matched for some co-variates.
                Match label is really only used in the cross validation partitioning.
                It keeps any set of instances with the same match label value in the
                same partition.
            instance_label: Instance label is mostly used by the rule based learner
                in modeling, we use it to trace back heterogeneous subgroups to the
                instances in the original dataset

        Raises:
            Exception: if the file extension is not supported or one of the
                given labels is not a column of the file (see load_data)
        """
        self.data = None
        # os.fspath leaves plain strings untouched and converts pathlib.Path objects
        self.path = os.fspath(dataset_path)
        # basename honours the platform's path separators instead of assuming '/'
        file_name = os.path.basename(self.path)
        # name is the text before the first dot, format the text after the last dot
        self.name = file_name.split('.')[0]
        self.format = file_name.split('.')[-1]
        self.class_label = class_label
        self.match_label = match_label
        self.instance_label = instance_label
        self.load_data()

    def _non_feature_columns(self):
        """
        Lists the label columns that are set, in the order class, instance, match

        Returns: list of column labels that are not features
        """
        columns = [self.class_label]
        for label in (self.instance_label, self.match_label):
            if label is not None:
                columns.append(label)
        return columns

    def load_data(self):
        """
        Function to load data in dataset

        Reads the file at self.path into self.data using the separator that
        belongs to self.format; the string 'NA' is read as a missing value.

        Raises:
            Exception: if self.format is not csv, tsv or txt, or if the class,
                match or instance label is not a column of the loaded file
        """
        logging.info("Loading Dataset: %s", self.name)
        separator = self._SEPARATORS.get(self.format)
        if separator is None:
            raise Exception("Unknown file format")
        self.data = pd.read_csv(self.path, na_values='NA', sep=separator)

        if self.class_label not in self.data.columns:
            raise Exception("Class label not found in file")
        # Optional labels are only validated when they are set to a non-empty value
        if self.match_label and self.match_label not in self.data.columns:
            raise Exception("Match label not found in file")
        if self.instance_label and self.instance_label not in self.data.columns:
            raise Exception("Instance label not found in file")

    def feature_only_data(self):
        """
        Create features-only version of dataset for some operations

        Returns: dataframe x_data with only features (class, instance and
            match columns removed)
        """
        x_data = self.data.drop(self._non_feature_columns(), axis=1)
        return x_data

    def non_feature_data(self):
        """
        Create non features version of dataset for some operations

        Returns: dataframe y_data with only non features, i.e. the class
            column followed by the instance and match columns when they are set
        """
        y_data = self.data[self._non_feature_columns()]
        return y_data

    def get_outcome(self):
        """
        Function to get outcome value from data

        Returns: outcome column
        """
        return self.data[self.class_label]

    def clean_data(self, ignore_features):
        """
        Basic data cleaning: Drops any instances with a missing outcome
        value as well as any features (ignore_features) specified by user

        Args:
            ignore_features: column labels to remove from the dataset;
                nothing is removed when this is None or empty
        """
        # Remove instances with missing outcome values
        self.data = self.data.dropna(axis=0, how='any', subset=[self.class_label])
        self.data = self.data.reset_index(drop=True)
        # The outcome column is stored as int8; the value range is not checked here
        self.data[self.class_label] = self.data[self.class_label].astype(dtype='int8')
        # Remove columns to be ignored in analysis
        if ignore_features:
            self.data = self.data.drop(ignore_features, axis=1)

    def set_headers(self, experiment_path, phase='exploratory'):
        """
        Exports dataset header labels for use as a reference later in the pipeline.

        The labels are written as a single csv row to
        <experiment_path>/<dataset name>/<phase>/OriginalFeatureNames.csv

        Args:
            experiment_path: path of the experiment output folder
            phase: name of the sub-folder the file is written to

        Returns: list of headers labels
        """
        output_dir = '/'.join([experiment_path, self.name, phase])
        # exist_ok avoids failing when another job creates the folder concurrently
        os.makedirs(output_dir, exist_ok=True)
        # Get Original Headers
        headers = self.data.columns.values.tolist()
        with open(output_dir + '/OriginalFeatureNames.csv', mode='w',
                  newline="") as file:
            writer = csv.writer(file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
            writer.writerow(headers)
        return headers