Tuesday, December 24, 2019

Python study notes 3: k-fold cross-validation, traditional regression in Python, sparse matrices in Python

Simple example using cross-validation in Python
Common useful packages/libraries/modules for machine learning
Using dummy coding for categorical variables in a regression example
How do we use a sparse matrix to save memory for a large design matrix
How do we use a generator to load a huge dataset/design matrix gradually


Simple example using cross-validation in Python
#================================================
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
scaler = MinMaxScaler()

from scipy.sparse import dia_matrix
import numpy as np
import pandas as pd
import scipy as sp
import scipy.linalg   # needed so that sp.linalg.pinv below is available

def mahalanobis(x=None, data=None, cov=None):
    x_minus_mu = x - np.mean(data)
    x_minus_mut=x_minus_mu.T
    if cov is None:
        cov = np.cov(data.values.T)
    inv_covmat = sp.linalg.pinv(cov)
#    inv_covmat = sp.linalg.inv(cov)
    left_term = np.dot(x_minus_mu, inv_covmat)

    # hold only the diagonal; float16 uses half the memory of float32
    #z = dia_matrix((left_term.shape[0],left_term.shape[0]), dtype=np.float32).toarray()
    #z = np.zeros((left_term.shape[0],left_term.shape[0]))
    z = np.zeros((left_term.shape[0],left_term.shape[0]), dtype=np.float16)

    # accumulate only the diagonal of left_term @ x_minus_mu.T instead of
    # materializing the full n x n product
    for k in range(left_term.shape[0]):
#        print("running k for ... ", k)
        z[k][k]=0
        for j in range(x_minus_mut.shape[0]):
#            print("running j for ... ", j)
            z[k][k] = z[k][k]+left_term[k][j]*x_minus_mut[k][j]
            
#    mahal = np.dot(left_term, x_minus_mu.T)
    return z.diagonal()

def dataprep(data0=None, ref_data0=None):
    # list_interested is the user's predefined list of feature columns
    df_x = data0[list_interested].copy()
    ref_data = ref_data0[list_interested].copy()

    df_x['yybltactdt']=df_x['yybltactdt'].clip(lower=1900,upper=2020)
    df_x['sumnbrbdrm']=df_x['sumnbrbdrm'].clip(upper=20)
    df_x['storiesnbr']=df_x['storiesnbr'].clip(upper=5)
    df_x['sumnbrbath']=df_x['sumnbrbath'].clip(upper=10)
    df_x['livsqftnbr']=df_x['livsqftnbr'].clip(upper=10000)

    ref_data['yybltactdt']=ref_data['yybltactdt'].clip(lower=1900,upper=2020)
    ref_data['sumnbrbdrm']=ref_data['sumnbrbdrm'].clip(upper=20)
    ref_data['storiesnbr']=ref_data['storiesnbr'].clip(upper=5)
    ref_data['sumnbrbath']=ref_data['sumnbrbath'].clip(upper=10)
    ref_data['livsqftnbr']=ref_data['livsqftnbr'].clip(upper=10000)    
    
    scaler.fit(ref_data)
    df_x1=pd.DataFrame(scaler.transform(df_x))
    df_x1.columns='pcnt_'+df_x.columns    
    
    df_x1a=df_x1.corr()
    list_col0=list(df_x1.columns)
    
    list_2remove = []        
    for list1 in list_col0: 
    #            print(list1)
        if ( (df_x1a[list1].isnull().sum()==df_x1a.shape[0]) | (np.isnan(df_x1.loc[:,list1].var()) )  ):
            list_2remove.extend([list1])
     
    #        print('list to remove:  ', list_2remove)       
    list_col1=list(set(list_col0)-set(list_2remove))

    df_x1=df_x1[list_col1]
    df_x1.fillna(0.5, inplace=True)
      
    list_2remove = []
    for i in range(len(list_col1)):
    #            print('i is :', i)
        for j in range(i+1,len(list_col1)):
    #                print('j is :', j)
            if  abs(abs(np.corrcoef(df_x1[list(list_col1)[i]], df_x1[list(list_col1)[j]])[0,1]) -1)<0.0001 :
                list_2remove.extend([list_col1[j]])
    
    #        print('strong correlate list to remove:  ', list_2remove)              
    #list_col2=list(set(list_col1)-set(list_2remove))
    list_col2=set(list_col1)-set(list_2remove)
    df_x1.shape
    df_x2=df_x1[list_col2]

    return df_x2

from sklearn.model_selection import KFold 
from sklearn.model_selection import train_test_split

z0=pd.read_parquet('sample_data_163_with_address')
z0.shape
situscensid0=525271016 

list_item=z0.groupby(['situscensid'],as_index=False).agg({"puid": 'count'}).sort_values(['puid'],ascending=False) 
list_item.head()
list_item1=list_item.loc[(list_item.puid>=10) & (list_item.situscensid.notna()) & (list_item.situscensid!='nan') ]
list_item1.shape
list_item2=list_item1.situscensid
#    list_item1[0:5]

test_all=pd.DataFrame(index=range(1))[0:0]
i=1
j=1
for vars2 in list_item2:
    y0=z0[(z0.situscensid==vars2)]  
    y0.reset_index(inplace=True)  
    print('**********************---------running for censusid: ',vars2,' for size of data ',y0.shape[0])
    j=j+1
    i=1                

    kf = KFold(n_splits=10, shuffle=True, random_state=i) 
    # split into 10 folds; each round uses 1 fold for testing and 9 folds for training, rotating.
    # (random_state only takes effect with shuffle=True; newer sklearn raises an error if it is passed with shuffle=False.)

    for train_index, test_index in kf.split(y0):
        train, test = y0.iloc[train_index], y0.iloc[test_index]
        print('---------running round ',i)
        i=i+1
    
        #train, test= train_test_split(y0, test_size=0.2, random_state=i) 
        #the above is for 80% training, 20% testing.       
        test0=test[(test.situsaddrstreetname.notna())].drop(columns=['yearbuilt'])
        #test0=test[(test.situscensid==situscensid0)  & (test.situsaddrstreetname.notna())].drop(columns=['yearbuilt'])
        test1=train.groupby(['situsaddrstreetname','yearbuilt'],as_index=False).agg({"master_puid": 'count'}).sort_values(['situsaddrstreetname'],ascending=True) 
        test1.columns=['situsaddrstreetname','yearbuilt','cnt']
        test1=test1[(test1.cnt>1)]
        #print('the size of test0 is ',test0.shape[0], ' the size of test1 is ', test1.shape[0])
        #test1[(test1.situsaddrstreetname=='ALASKA')]
        test2=pd.merge(test0,test1[['situsaddrstreetname','yearbuilt']],
                    left_on='situsaddrstreetname',right_on='situsaddrstreetname',how='left')
        #test2.sort_values(['situsaddrstreetname'],ascending=True).head(5)
        test2=test2[(test2.yearbuilt.notna())]
        #print('the size of test2 is ',test2.shape[0])
        #test.yearbuilt.value_counts(dropna=False)
        if test2.shape[0]>1:
       
            train1=dataprep(data0=train[list_interested], ref_data0=y0[list_interested])
            test3=dataprep(data0=test2[list_interested], ref_data0=y0[list_interested]) 
            print(train1.shape,train1.columns,test3.shape,test3.columns)

            list_inner=list(set(train1.columns) & set(test3.columns))
            #inner join 2 list of common parts

            if len(list_inner)>1: 
                test3=test3[list_inner]
                train1=train1[list_inner]
              #  if train1.shape[1]<=test3.shape[1]:
              #      test3=test3[train1.columns]
              #  else :
              #      train1=train1[test3.columns]

                test3['mahala'] = mahalanobis(x=test3, data=train1)
                test3['id']=test2.index

                test4=pd.merge(test2,test3,left_index=True, right_on='id',how='left')
                test2.shape                        
                test4[['master_puid','mahala','yearbuilt','address','situsaddrstreetname']].head(30)
                test5=test4.groupby(['master_puid'],as_index=False).agg({"mahala": 'min'})
                test5['is_pred']='1'
                test6=pd.merge(test4,test5,left_on=['master_puid','mahala'], right_on=['master_puid','mahala'],how='inner')
                test7=test6.sort_values(by=['master_puid','mahala'],ascending=[False,False]).drop_duplicates(subset =['master_puid'])
                test6.head()
                test6.shape
                #test6[(test6.master_puid=='6288741886')]

                #test6[['master_puid','mahala','yearbuilt','address','situsaddrstreetname','is_pred']].head(400)
                test8=pd.merge(test0,test7[['master_puid','yearbuilt','is_pred']],left_on=['master_puid'], right_on=['master_puid'],how='inner')    
                test8.columns
                test8[['master_puid', 'cntycd','yybltactdt','yearbuilt', 'is_pred','situsaddrstreetname', 'address']].head(70)
                test8['match'] = np.where( (test8.yybltactdt.notna()) & (abs(test8.yearbuilt-test8.yybltactdt)<3), 1, 0)
                test8.match.value_counts()                    
                test8['round']=i
                test8['situscensid']=vars2

                test_all=test_all.append(test8)

#test_all.to_parquet('geo_oc', engine='fastparquet',compression='GZIP')
#================================================
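The pipeline above is tied to a specific housing dataset. For reference, here is a minimal, self-contained k-fold sketch on synthetic data (Ridge regression is just an illustrative choice, not part of the original pipeline) showing the same rotate-one-fold-out pattern:
#================================================
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold

X, y = make_regression(n_samples=200, n_features=5, noise=0.1, random_state=0)
kf = KFold(n_splits=10, shuffle=True, random_state=1)
scores = []
for train_index, test_index in kf.split(X):
    # fit on 9 folds, score on the held-out fold
    model = Ridge().fit(X[train_index], y[train_index])
    scores.append(model.score(X[test_index], y[test_index]))
print('mean R^2 over 10 folds:', round(np.mean(scores), 4))
#================================================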


Question: Which common packages/libraries/modules do we load for machine learning?
Answer:
#================================================
#sklearn is a powerful package: preprocessing, model_selection, linear_model,
#sklearn.cluster, feature_extraction, feature_selection, datasets,
#sklearn.ensemble: combines several models together 

from sklearn import preprocessing
from sklearn import model_selection as cv
from sklearn import linear_model as lm
from sklearn.cluster import KMeans  ##for clustering 

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectFromModel 

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn import linear_model  
### fits generalized linear regression models, including Ridge regression,
### Bayesian Regression, Lasso and Elastic Net estimators, and Stochastic Gradient Descent.
from sklearn.linear_model import Ridge

from sklearn.datasets import make_blobs 
#create fake data(isotropic Gaussian) for clustering
X, y = make_blobs(n_samples=100, n_features=2, centers=3, cluster_std=1.0, 
                  center_box=(-10.0, 10.0), shuffle=True, random_state=0)

from sklearn.datasets import load_iris
from sklearn.datasets import make_classification 
X, y = make_classification(n_features=4, random_state=0)
### by default, creates 2 classes of points drawn from normally distributed clusters.
X, y = make_classification(n_samples=100, n_features=20, n_informative=2, n_redundant=2, 
                           n_repeated=0, n_classes=2, n_clusters_per_class=2, weights=None, flip_y=0.01, 
                           class_sep=1.0, hypercube=True, shift=0.0, scale=1.0, shuffle=True, random_state=0)
#================================================

Example of using a linear support vector classifier (LinearSVC):
#================================================ 
from sklearn.svm import LinearSVC #linear support vector Classifier 
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
iris = load_iris()
X, y = iris.data, iris.target
X.shape

lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(X)
X_new.shape

from sklearn.svm import LinearSVC
from sklearn.datasets import make_classification
X, y = make_classification(n_features=4, random_state=0)
clf = LinearSVC(random_state=0, tol=1e-5)
clf.fit(X, y)
print(clf.coef_)
print(clf.intercept_)
print(clf.predict([[0, 0, 0, 0]]))
#================================================

Bagging with k-nearest neighbors classification example:
#================================================     
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
bagging = BaggingClassifier(KNeighborsClassifier(),
max_samples=0.5, max_features=0.5)
#================================================
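A short usage sketch for the bagging classifier defined above (synthetic data via make_classification; the sample sizes are illustrative):
#================================================
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score

X, y = make_classification(n_samples=500, n_features=10, random_state=0)
# 5-fold cross-validated accuracy of the bagged KNN model defined above
scores = cross_val_score(bagging, X, y, cv=5)
print('mean accuracy:', round(scores.mean(), 3))
#================================================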

Feature selection example:
#================================================  
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
iris = load_iris()
X, y = iris.data, iris.target
X.shape

X_new = SelectKBest(chi2, k=2).fit_transform(X, y)
#perform chi-square test to select only the two best features
X_new.shape 
#================================================

Ensemble modeling: VotingClassifier example:
#================================================ 
#1. try the ensemble method: VotingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import xgboost as xgb

seed = 12345
estimators = []
model1 = LogisticRegression(random_state=seed)
estimators.append(('logistic', model1))
model2 = DecisionTreeClassifier(random_state=seed)
estimators.append(('DecisionTree', model2))
model3 = RandomForestClassifier(n_estimators=25, min_samples_split=25, max_depth=5, random_state=seed)
estimators.append(('Random Forest', model3))
model4 = GaussianNB()
estimators.append(('Gaussian NB', model4))
model5 = SVC(kernel='linear')
estimators.append(('SVM', model5))
model6 = xgb.XGBClassifier(random_state=seed, learning_rate=0.01)
estimators.append(('XGB', model6))
# create the ensemble model and evaluate it
# (xtrain/ytrain/xtest/ytest and the Metrics helper come from the user's own pipeline)
ensemble = VotingClassifier(estimators)
ensemble.fit(xtrain, ytrain)
pred_ensemble = ensemble.predict(xtest)
Metrics(ytest, pred_ensemble)
#================================================

Ensemble modeling: StackingClassifier example:
#================================================
#2. try the stacking method (no improvement in this case)
from mlxtend.classifier import StackingClassifier
from sklearn import model_selection
seed = 12345
model1 = LogisticRegression(random_state=seed)
model2 = DecisionTreeClassifier(random_state=seed)
model3 = RandomForestClassifier(n_estimators=25, min_samples_split=25, max_depth=5, random_state=seed)
model4 = GaussianNB()
model5 = SVC(kernel='linear')
model6 = xgb.XGBClassifier(random_state=seed, learning_rate=0.01)
sclf = StackingClassifier(classifiers=[model2, model3, model4, model5, model6],
                          meta_classifier=model1)
sclf.fit(xtrain, ytrain)
pred_ensemble = sclf.predict(xtest)
Metrics(ytest, pred_ensemble)

# cross-validated score of the stacked model
model = StackingClassifier(classifiers=[model2, model3, model4, model5, model6],
                           meta_classifier=model1)
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
results = model_selection.cross_val_score(model, data_scaled, data_Outcome, cv=kfold)
print('\n******************************************')
print('Average score for cross-validation: {0:0.2f}'.format(results.mean()))
print('******************************************')
#================================================


Question: How many different ways can we handle categorical variables in Python?


Answer: For encoding categorical features, there are usually 3 ways: LabelEncoder/OneHotEncoder, DictVectorizer, and Pandas get_dummies. Here are the major differences:

1. Pandas get_dummies is the "best"! If you are not sure which of the three to use, use get_dummies. It's the most common way to create dummy variables: it actually adds the dummy variables onto the dataframe, so it changes the dataframe.

2. LabelEncoder converts a categorical column into a column of labels from 0 to n-1, using numbers instead of the raw text descriptions, even though the order of those numbers 0, 1, 2, ... has no real meaning.

3. Unlike get_dummies, OneHotEncoder (OHE) does not add variables to your dataframe. It creates dummy variables by transforming X, and all the dummies are stored in X. Also, one-hot encoding of high-cardinality features often requires an unrealistic amount of computational resources. OHE only works with categories that are integers greater than or equal to zero, so if the category names are strings or negative numbers, you will have to do some transformation before fitting OHE.
#===============================================================
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
x[:, 0] = labelencoder.fit_transform(x[:, 0])  
#the first column will become a column of number from 0--(n-1).

from sklearn.preprocessing import OneHotEncoder
onehotencoder = OneHotEncoder(categorical_features = [0])
# note: categorical_features is deprecated in newer scikit-learn; ColumnTransformer is the replacement there
x = onehotencoder.fit_transform(x).toarray()
#it will not add actual columns to the dataframe

bin_mileage_dummies = pd.get_dummies(atm3['bin_mileage'],prefix='mileage')
bin_condition_dummies = pd.get_dummies(atm3['bin_condition'],prefix='condition')
atm4=pd.concat([atm3,bin_mileage_dummies,bin_condition_dummies], axis=1)

#to get dummies for column A and D only, not for the other columns:
pd.get_dummies(df, prefix=['A', 'D'], columns=['A', 'D'])

#Redshift SQL to generate the percentile (percent_rank) and the decile (ntile)
drop_table('data_name1')
ut0 = run_query("""create table data_name1
        DISTSTYLE EVEN 
        as select distinct ymmt, mid, mileage,
           percent_rank() over (partition by ymmt order by mileage) as mileage_pcnt
        from data_name0         
""")

drop_table('data_name2')
ut0 = run_query("""create table data_name2
        DISTSTYLE EVEN 
        as select distinct ymmt, mid, mileage,
           ntile(10) over (partition by ymmt order by mileage) as mileage_decile
        from data_name0         
""")
#===============================================================
In the normal training/testing setup, the training and testing data sometimes contain different numbers of categories. For example, for some categorical variable var1, the training data might have 10 categories while the testing data has 15. How can we generate the complete set of dummies to train the model, and also use it for scoring later?

An intuitive approach is to combine the training and testing data and then call get_dummies (a short sketch of this follows the code block below). Another, more hands-on approach is to build the complete set of dummies as follows: after df1 = pd.get_dummies(df, prefix=['A', 'D'], columns=['A', 'D']), add the missing columns to df1, setting them to 0.
#===============================================================
def add_missing_dummy_columns( d, columns ):
    missing_cols = set( columns ) - set( d.columns )
    for c in missing_cols:
        d[c] = 0
add_missing_dummy_columns(df1,["cat1_new","cat2_new","cat_new3"])        
#===============================================================
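And a minimal sketch of the combine-then-split approach mentioned above (df_train and df_test are hypothetical frames sharing the categorical columns A and D):
#===============================================================
# combine, encode once, then split back (df_train/df_test are hypothetical)
import pandas as pd

combined = pd.concat([df_train.assign(_src='train'), df_test.assign(_src='test')])
combined = pd.get_dummies(combined, prefix=['A', 'D'], columns=['A', 'D'])
train_d = combined[combined._src == 'train'].drop(columns='_src')
test_d  = combined[combined._src == 'test'].drop(columns='_src')
# train_d and test_d now share exactly the same dummy columns
#===============================================================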

Question: How do we break a list or column into multiple bins, deciles, or quantiles?
Answer: We can use the cut or qcut functions in pandas: cut breaks a list or column into bins of equal width, while qcut breaks it into deciles/quantiles (bins holding a similar amount of data).
#===============================================================
label = [1,2,3,4,5]               # labels must match the number of bins
#label = [1,2,3,4,5,6,7,8,9,10]   # use this instead with bins=10
atm2.dtypes
bins=5
atm2['bin_mileage']=pd.cut(atm2.mileage_function, bins, labels=label)
atm2['bin_condition']=pd.cut(atm2['condition_function'], bins, labels=label)
atm2['qbin_condition']=pd.qcut(atm2['condition_function'], bins, labels=label)
atm2['bin_age']=pd.cut(atm2.age_function, bins, labels=label)
atm2.columns
atm2.head()
atm2.qbin_condition.value_counts()
atm2a=atm2.dropna()
atm3=atm2a.reset_index(drop=True)
atm3.shape

#check=numpy.digitize(atm2.mileage_function, 10)
bin_mileage_dummies = pd.get_dummies(atm3['bin_mileage'],prefix='mileage')
bin_condition_dummies = pd.get_dummies(atm3['bin_condition'],prefix='condition')
bin_age_dummies = pd.get_dummies(atm3['bin_age'],prefix='age')
region_dummies = pd.get_dummies(atm3['mmr_region'])
color_dummies = pd.get_dummies(atm3['color1'])

atm4=pd.concat([atm3,region_dummies,color_dummies,bin_mileage_dummies,bin_condition_dummies,bin_age_dummies], axis=1) 
#===============================================================

The third-party regressors package provides several functions that compute classical summary statistics (SSE, adjusted R-squared, coefficient standard errors, t- and p-values, F statistic) for scikit-learn regression models.
#================================================
from sklearn import linear_model
from regressors import stats
ols = linear_model.LinearRegression()
ols.fit(X, y)
## there are 8 summary statistics:
stats.sse(ols, X, y)               # 1. sum of squared errors
stats.adj_r2_score(ols, X, y)      # 2. adjusted R-squared
stats.coef_se(ols, X, y)           # 3. standard errors of the coefficients
stats.coef_tval(ols, X, y)         # 4. t-values of the coefficients
stats.coef_pval(ols, X, y)         # 5. p-values of the coefficients
stats.f_stat(ols, X, y)            # 6. F statistic
stats.residuals(ols, X, y)         # 7. residuals
stats.summary(ols, X, y, Xlabels)  # 8. full summary table (Xlabels = list of feature names)
#================================================
Creating a design matrix from the dummies of a categorical variable can take an enormous amount of space if the variable has hundreds or thousands of levels.

We can use a sparse matrix to reduce the size of the design matrix. Sparse storage works best when the design matrix contains mostly 0s, since only the non-zero elements are stored. Below is sample code that uses a sparse matrix: it loads a subset of data from AWS/Redshift, converts that subset into a piece of the design matrix, and appends the pieces together chunk by chunk.

scipy.sparse.lil_matrix: a row-based "list of lists" sparse matrix.
This structure is designed for constructing sparse matrices incrementally. Note that inserting a single item can take linear time in the worst case; to construct a matrix efficiently, make sure the items are pre-sorted by index, per row.

Another common sparse format is dia_matrix: a sparse matrix with DIAgonal storage.
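Before the full Redshift-driven pipeline below, here is a minimal, self-contained sketch (synthetic one-hot data; the sizes and names are illustrative) of the incremental lil_matrix build and the memory saving versus a dense array:
#================================================
import numpy as np
from scipy import sparse

n_rows, n_levels = 100000, 2000            # e.g. 2000 dummy columns
lil = sparse.lil_matrix((n_rows, n_levels))
rng = np.random.default_rng(0)
for i in range(n_rows):                    # one non-zero dummy per row
    lil[i, rng.integers(n_levels)] = 1.0

csr = lil.tocsr()                          # convert for fast arithmetic
dense_gb  = n_rows * n_levels * 8 / 1024**3
sparse_gb = (csr.data.nbytes + csr.indices.nbytes + csr.indptr.nbytes) / 1024**3
print('dense: %.2f GB   sparse csr: %.4f GB   non-zeros: %d'
      % (dense_gb, sparse_gb, csr.count_nonzero()))
#================================================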
#================================================
our_model = """log_price ~  
    C(cat_var1, Treatment(reference=ref_value1))
    + C(cat_var2, Treatment(reference=ref_value2))
    + num_var1 + num_var2"""

matrix_data_template_query = """
    select cat_var1, cat_var2,
        avg(num_var1) num_var1,
        avg(num_var2) num_var2
    from redshift_data1
    where var='{segment}'
    group by cat_var1, cat_var2
    """    
#get the patsy template design_info with all levels
#(pts is patsy, e.g. import patsy as pts; get_data_frame is the user's query helper)
template_df = get_data_frame(matrix_data_template_query
                             .format(segment=segment))

templY, templX = pts.dmatrices(our_model, 
                               data=template_df, return_type='dataframe')
    
#get number of rows in training data
seg_cnt = get_data_frame(segment_query.format(
segment=segment, cols='count(*) as train_cnt', 
start_dt='1990-01-01', end_dt=training_end))
seg_cnt

#rows and cols count for sparse matrix init
row_count = seg_cnt.train_cnt[0]
col_count = len(templX.design_info.column_names)

###define the sparsity matrix dimension###
from scipy import sparse
train_lil_X = sparse.lil_matrix((row_count, col_count))
#define sparsity matrix lil_matrix
trainY = []
    
import sqlalchemy as sa
engine = sa.create_engine(
        'redshift+psycopg2://{user}:{password}@{host}:{port}/{dbname}'.format(
            dbname=DATABASE, 
            user=USER, 
            password=PASSWORD,
            port=PORT,
            host=HOST),
        server_side_cursors=True,
        ## very necessary for fetching the data chunk by chunk ##        
        encoding='utf-8',
        connect_args={"keepalives": 1,
                      "keepalives_idle": 60, "keepalives_interval": 60}  
        )
sdg_minibatch_size = 10000

result = engine.execute(segment_query.format(
segment=segment, cols=segment_query_cols,
start_dt='1990-01-01', end_dt=training_end))
result.keys()

r = result.fetchmany(sdg_minibatch_size)

# incrementally build the sparse matrix, one chunk at a time
k = 0
while r:
    print('\rprocessing rows', k, end="", flush=True)

    df = pd.DataFrame(r, columns=result.keys())

    # get a chunk of the design matrix using the template design_info
    partY, partX = pts.dmatrices((templY.design_info, templX.design_info), df)
    partY = partY.ravel()

    # append the chunk of the design matrix to the sparse matrix
    train_lil_X[k:k + partX.shape[0], :] = partX
    trainY = np.append(trainY, partY)

    k = k + partX.shape[0]

    # get the next chunk of 10000 rows
    r = result.fetchmany(sdg_minibatch_size)

print()
print('trainY:', trainY.shape, 'train_lil_X:', train_lil_X.shape,
      'train_lil_X non-zero:', train_lil_X.count_nonzero())

#TODO: convert to another format for fast matrix operations: csr, csc, coo 
#csr: Compressed Sparse Row format
#csc: Compressed Sparse Column format
#coo: COOrdinate format
#consider using the COO format when constructing large matrices
train_csr_X = train_lil_X.tocsr()

#fit the linear model on the sparse design matrix
lr = lm.LinearRegression(fit_intercept=False)
lr.fit(X=train_csr_X, y=trainY)
len(lr.coef_)

data_merge = pd.DataFrame(list(zip(lr.coef_, 
                                   templX.design_info.column_names)),
                          columns=['Coefficient','Variable'])
#================================================ 
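The fetchmany loop above can also be wrapped in a Python generator, which keeps the chunk-by-chunk loading logic in one place. A sketch under the same assumptions (result is the SQLAlchemy result proxy, and pts/templX/templY are defined as above):
#================================================
import pandas as pd

def fetch_in_chunks(result, chunk_size=10000):
    """Yield DataFrames of at most chunk_size rows from a server-side cursor result."""
    cols = result.keys()
    rows = result.fetchmany(chunk_size)
    while rows:
        yield pd.DataFrame(rows, columns=cols)
        rows = result.fetchmany(chunk_size)

# usage sketch:
# for df_chunk in fetch_in_chunks(result, sdg_minibatch_size):
#     partY, partX = pts.dmatrices((templY.design_info, templX.design_info), df_chunk)
#     ... append partX/partY to the sparse matrix as above ...
#================================================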



Question: Simple example to calculate the Mahalanobis distance in Python
Answer:
#================================================
import pandas as pd
import scipy as sp
import scipy.linalg   # makes sp.linalg.inv below available
import numpy as np

filepath = 'https://raw.githubusercontent.com/selva86/datasets/master/diamonds.csv'
df = pd.read_csv(filepath).iloc[:, [0,4,6]]
df.head()

def mahalanobis(x=None, data=None, cov=None):
    """Compute the Mahalanobis Distance between each row of x and the data  
    x    : vector or matrix of data with, say, p columns.
    data : ndarray of the distribution from which Mahalanobis distance of each observation of x is to be computed.
    cov  : covariance matrix (p x p) of the distribution. If None, will be computed from data.
    """
    x_minus_mu = x - np.mean(data)
    if cov is None:
        cov = np.cov(data.values.T)
    inv_covmat = sp.linalg.inv(cov)
    left_term = np.dot(x_minus_mu, inv_covmat)
    mahal = np.dot(left_term, x_minus_mu.T)
    return mahal.diagonal()

df_x = df[['carat', 'depth', 'price']].head(500)
df_x['mahala'] = mahalanobis(x=df_x, data=df[['carat', 'depth', 'price']])
df_x.head()
#================================================ 

Mahalanobis distance can also be used for classification problems. The intuition is that an observation is assigned to the class it is closest to, as measured by the Mahalanobis distance. One-class classification is a type of algorithm where the training dataset contains observations belonging to only one class.

With only that information known, the objective is to figure out whether a given observation in a new (or test) dataset belongs to that class. You might wonder when such a situation would occur. Well, it's quite a common problem in data science.

For example, consider the following situation: you have a large dataset containing millions of records that are NOT yet categorized as 1's and 0's. But you also have a small sample dataset containing only positive (1's) records. By learning the information in this sample dataset, you want to classify all the records in the large dataset as 1's and 0's.

Based on the information from the sample dataset, it is possible to tell whether any given sample is a 1 or a 0 by viewing only the 1's (and having no knowledge of the 0's at all).
#================================================
df = pd.read_csv('https://raw.githubusercontent.com/selva86/datasets/master/BreastCancer.csv', 
                 usecols=['Cl.thickness', 'Cell.size', 'Marg.adhesion', 
                          'Epith.c.size', 'Bare.nuclei', 'Bl.cromatin', 'Normal.nucleoli', 
                          'Mitoses', 'Class'])

df.dropna(inplace=True)  # drop missing values.

from scipy.stats import chi2   # for the chi-squared critical value and p-values
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(df.drop('Class', axis=1), df['Class'], test_size=.5, random_state=100)

# Split the training data as pos and neg
xtrain_pos = xtrain.loc[ytrain == 1, :]

class MahalanobisOneclassClassifier():
    def __init__(self, xtrain, significance_level=0.01):
        self.xtrain = xtrain
        self.critical_value = chi2.ppf((1-significance_level), df=xtrain.shape[1]-1)
        print('Critical value is: ', self.critical_value)

    def predict_proba(self, xtest):
        mahalanobis_dist = mahalanobis(xtest, self.xtrain)
        self.pvalues = 1 - chi2.cdf(mahalanobis_dist, 2)
        return mahalanobis_dist

    def predict(self, xtest):
        return np.array([int(i) for i in self.predict_proba(xtest) > self.critical_value])

clf = MahalanobisOneclassClassifier(xtrain_pos, significance_level=0.05)
mahalanobis_dist = clf.predict_proba(xtest)

# Pred and Truth
mdist_actuals = pd.DataFrame([(m, act) for m, act in zip(mahalanobis_dist, ytest)], columns=['mahal', 'true_class'])
print(mdist_actuals[:5])            

# using quantile cut in 10 pieces
mdist_actuals['quantile'] = pd.qcut(mdist_actuals['mahal'], 
                                    q=[0, .10, .20, .3, .4, .5, .6, .7, .8, .9, 1], 
                                    labels=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
# sort by mahalanobis distance 
mdist_actuals.sort_values('mahal', inplace=True)
perc_truths = mdist_actuals.groupby('quantile').agg({'mahal': np.mean, 'true_class': np.sum}).rename(columns={'mahal':'avg_mahaldist', 'true_class':'sum_of_trueclass'})
print(perc_truths)
#================================================ 

Issue: MemoryError in Python when converting to an array

Answer: When we tried to calculate the Mahalanobis distance via dia_matrix(...).toarray(), you might get a MemoryError. For instance, a 0.5M x 0.5M matrix with dtype=int8 (1 byte per item) needs 500000*500000*1/(1024*1024*1024) ≈ 233 GB of space; with dtype=float32 (4 bytes per item), it needs 500000*500000*4/(1024*1024*1024) ≈ 931 GB. So converting to an array for the operations, dia_matrix((left_term.shape[0], left_term.shape[0]), dtype=np.float32).toarray(), raises the MemoryError. Note that there is no MemoryError if you only define the sparse matrix, dia_matrix((left_term.shape[0], left_term.shape[0]), dtype=np.float32), without calling .toarray().
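A quick back-of-the-envelope check of these numbers (a sketch; 500000 is just the dimension from the example above):
#================================================
import numpy as np

n = 500_000
for dt in (np.int8, np.float16, np.float32, np.float64):
    gb = n * n * np.dtype(dt).itemsize / 1024**3
    print('%-8s -> %8.1f GB for a %d x %d dense array' % (np.dtype(dt).name, gb, n, n))
#================================================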

Here is the earlier, lazy version of the code to calculate the Mahalanobis distance; it does unnecessary computation (the full n x n product when only the diagonal is needed) and takes a long time and a lot of memory.
#================================================
#def mahalanobis(x=None, data=None, cov=None):
#    """Compute the Mahalanobis Distance between each row of x and the data  
#    x    : vector or matrix of data with, say, p columns.
#    data : ndarray of the data from which Mahalanobis dist of each record of x is to be computed.
#    cov  : covariance matrix (p x p) of the distribution. If None, will be computed from data.
#    """
#    x_minus_mu = x - np.mean(data)
#    if not cov:
#        cov = np.cov(data.values.T)
#    inv_covmat = sp.linalg.inv(cov)
#    left_term = np.dot(x_minus_mu, inv_covmat)
#    mahal = np.dot(left_term, x_minus_mu.T)
#    return mahal.diagonal()
#================================================

Here is the revised version:
#================================================
def mahalanobis(x=None, data=None, cov=None):
    """Compute the Mahalanobis Distance between each row of x and the data  
    x    : vector or matrix of data with, say, p columns.
    data : ndarray of the data from which Mahalanobis dist of each record of x is to be computed.
    cov  : covariance matrix (p x p) of the distribution. If None, will be computed from data.
    """
    x_minus_mu = x - np.mean(data)
    x_minus_mut=x_minus_mu.T
    if cov is None:
        cov = np.cov(data.values.T)
    inv_covmat = sp.linalg.pinv(cov)
#    inv_covmat = sp.linalg.inv(cov)
    left_term = np.dot(x_minus_mu, inv_covmat)
    from sys import getsizeof
    import datetime

    time_start = datetime.datetime.now()

    z = np.zeros((left_term.shape[0],left_term.shape[0]), dtype=np.float16) 
    # better than dia_matrix or lil_matrix, which have no float16 dtype; float16 saves half the space
    #z = dia_matrix((left_term.shape[0],left_term.shape[0]), dtype=np.float32).toarray()
    #z = lil_matrix((left_term.shape[0],left_term.shape[0]), dtype=np.float32).toarray()
    print('the size of z in GB:', round(getsizeof(z)/(1024*1024*1024), 2))
    for k in range(left_term.shape[0]):
#        print("running k for ... ", k)
        z[k][k]=0
        for j in range(x_minus_mut.shape[0]):
#            print("running j for ... ", j)
            z[k][k] = z[k][k]+left_term[k][j]*x_minus_mut[k][j]
            
    time_end = datetime.datetime.now()
    time_elapsed = time_end - time_start
    print("time taken: ", time_elapsed)
    # the original run also wrote the result back to the caller's dataframe:
    #all0['mahala'] = z.diagonal()
    #print(round(sum(all0['mahala'].head(10)),2))

#    mahal = np.dot(left_term, x_minus_mu.T)
    return z.diagonal()
#================================================ 
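An even lighter-weight alternative (a sketch, not the original code above) is to compute only the row-wise dot products with np.einsum, which avoids allocating the n x n matrix entirely and also avoids the slow Python loops; it assumes x and data are pandas DataFrames as in the examples above:
#================================================
import numpy as np
import pandas as pd
import scipy.linalg

def mahalanobis_diag(x, data, cov=None):
    """Row-wise squared Mahalanobis distances without building an n x n matrix."""
    x_minus_mu = x - np.mean(data)           # center by the column means of data
    if cov is None:
        cov = np.cov(data.values.T)
    inv_covmat = scipy.linalg.pinv(cov)
    left_term = np.dot(x_minus_mu, inv_covmat)
    # einsum('ij,ij->i') computes only the diagonal of left_term @ x_minus_mu.T
    return np.einsum('ij,ij->i', left_term, np.asarray(x_minus_mu))
#================================================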
Issue: MemoryError when converting a huge matrix to an array
Answer: When you run the previous code, you might find that it works fine on your laptop (e.g. a MacBook Pro), but when you run it on a GCP cluster or a VM Jupyter environment you get the MemoryError, even if you change to dtype=int8, which is almost the smallest data type. Why is that, when the GCP cluster is much bigger than the laptop?

The answer is most likely the system's default setup for overcommit handling: in the default mode, vm.overcommit_memory=0.

You can check your current overcommit mode by running in a terminal:
$ cat /proc/sys/vm/overcommit_memory
0    (the default, most of the time)


You can change the value to 1 to avoid this issue. As root, run:
$ echo 1 > /proc/sys/vm/overcommit_memory
$ cat /proc/sys/vm/overcommit_memory
1    (the new value)


This enables "always overcommit" mode, and you'll find that the system will indeed allow you to make the allocation no matter how large it is (within 64-bit memory addressing, at least). I tested this myself on a GCP cluster (Ubuntu 18): with overcommit mode 0 I also got a MemoryError, but after changing it to 1 it worked.

If the memory error happens on a Windows machine, you can also check the page file settings and try increasing the page file size.

If you get a "permission denied" message when changing overcommit_memory to 1 in the terminal, you can do the following:
1. Keep using "pwd", "cd ..", and "ls -a" until you reach the root directory, or use "sudo -i" or "sudo su -"; you will see a few folders such as: etc home root proc sys
2. cd root
3. echo "vm.overcommit_memory=1" >> /etc/sysctl.conf
You will see something like the following in /etc/sysctl.conf:
fs.suid_dumpable=0
kernel.randomize_va_space=2 
...
net.ipv4.tcp_syncookies=1
vm.overcommit_memory=1


4. After running the above, you may need to run "sysctl -p" (or reboot) for the sysctl.conf change to take effect; then double-check the setting with "cat /proc/sys/vm/overcommit_memory", and you should get the value 1 instead of 0.

Once you have finished changing the overcommit_memory setting, change back to your working directory to run Python; your prompt should look something like: (base) your_user_name. If you are still under the root directory (root@cluster_or_machine_name home), you might have trouble running Python and see errors like "ImportError: can't find module ...."; in that case, simply open a new terminal to run your Python code.





Architecting with Google Compute Engine Specialization : 4 Courses in this Specialization. 1. Google Cloud Platform Fundamentals: Core In...