Tuesday, December 24, 2019

Python study notes 3: traditional regression in Python, sparse matrices in Python


Common useful packages/libraries/modules for machine learning
Using dummy coding for categorical variables in a regression example
How to use a sparse matrix to save memory for a large design matrix
How to use a generator to load huge data/design matrices gradually


Question: Which common packages/libraries/modules do we load for machine learning?
Answer:
#================================================
#sklearn: powerful package: preprocessing, model_selection, linear_model,
#sklearn.cluster, feature_extraction, feature_selection, datasets,
#sklearn.ensemble: combine several models together

from sklearn import preprocessing
from sklearn import model_selection as cv
from sklearn import linear_model as lm
from sklearn.cluster import KMeans  ##for clustering 

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectFromModel 

from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn import linear_model
###fits generalized linear models, including Ridge regression,
###Bayesian Regression, Lasso and Elastic Net estimators, and Stochastic Gradient Descent
from sklearn.linear_model import Ridge

from sklearn.datasets import make_blobs
#create fake data (isotropic Gaussian blobs) for clustering
X, y = make_blobs(n_samples=100, n_features=2, centers=3, cluster_std=1.0,
                  center_box=(-10.0, 10.0), shuffle=True, random_state=0)

from sklearn.datasets import load_iris
from sklearn.datasets import make_classification 
X, y = make_classification(n_features=4, random_state=0)
###creates 2 classes of normally distributed clusters by default; full signature:
#make_classification(n_samples=100, n_features=20, n_informative=2, n_redundant=2,
#    n_repeated=0, n_classes=2, n_clusters_per_class=2, weights=None, flip_y=0.01,
#    class_sep=1.0, hypercube=True, shift=0.0, scale=1.0, shuffle=True, random_state=0)
#================================================
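
A quick sanity check of the clustering imports above: a minimal sketch (my own example, not from the original notes) that fits KMeans on the fake blob data produced by make_blobs.
#================================================
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

#generate 100 points in 2D around 3 isotropic Gaussian centers
X, y = make_blobs(n_samples=100, n_features=2, centers=3,
                  cluster_std=1.0, random_state=0)

#fit KMeans with the same number of clusters and inspect the result
km = KMeans(n_clusters=3, random_state=0).fit(X)
print(km.cluster_centers_)   #estimated cluster centers, shape (3, 2)
print(km.labels_[:10])       #cluster assignment of the first 10 points
#================================================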

Example of using a linear support vector classifier (LinearSVC):
#================================================ 
from sklearn.svm import LinearSVC #linear support vector Classifier 
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectFromModel
iris = load_iris()
X, y = iris.data, iris.target
X.shape

lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
model = SelectFromModel(lsvc, prefit=True)
X_new = model.transform(X)
X_new.shape

from sklearn.svm import LinearSVC
from sklearn.datasets import make_classification
X, y = make_classification(n_features=4, random_state=0)
clf = LinearSVC(random_state=0, tol=1e-5)
clf.fit(X, y)
print(clf.coef_)
print(clf.intercept_)
print(clf.predict([[0, 0, 0, 0]]))
#================================================

Bagging with k-nearest neighbors classification example:
#================================================     
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
bagging = BaggingClassifier(KNeighborsClassifier(),
max_samples=0.5, max_features=0.5)
#================================================
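
The snippet above only constructs the bagged KNN model. A minimal usage sketch (my own addition, reusing the iris data loaded earlier) to actually fit and evaluate it:
#================================================
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score

iris = load_iris()
#each base KNN estimator is trained on 50% of the rows and 50% of the features
scores = cross_val_score(bagging, iris.data, iris.target, cv=5)
print(scores.mean())
#================================================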

Feature selection example:
#================================================  
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
iris = load_iris()
X, y = iris.data, iris.target
X.shape

X_new = SelectKBest(chi2, k=2).fit_transform(X, y)
#perform chi-square test to select only the two best features
X_new.shape 
#================================================

Ensemble modeling: VotingClassifier example:
#================================================ 
#1. try the ensemble method: VotingClassifier
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import xgboost as xgb
seed = 12345
estimators = []
model1 = LogisticRegression(random_state=seed)
estimators.append(('logistic', model1))
model2 = DecisionTreeClassifier(random_state=seed)
estimators.append(('DecisionTree', model2))
model3 = RandomForestClassifier(n_estimators=25, min_samples_split=25,
                                max_depth=5, random_state=seed)
estimators.append(('Random Forest', model3))
model4 = GaussianNB()
estimators.append(('Gaussian NB', model4))
model5 = SVC(kernel='linear')
estimators.append(('SVM', model5))
model6 = xgb.XGBClassifier(random_state=seed, learning_rate=0.01)
estimators.append(('XGB', model6))
# create the ensemble model and evaluate it
# (xtrain/ytrain/xtest/ytest and the Metrics helper are defined elsewhere)
ensemble = VotingClassifier(estimators)
ensemble.fit(xtrain, ytrain)
pred_ensemble = ensemble.predict(xtest)
Metrics(ytest, pred_ensemble)
#================================================

Ensemble modeling: StackingClassifier example:
#================================================
#2. try the stacking method (no improvement observed here)
from mlxtend.classifier import StackingClassifier
from sklearn import model_selection
seed = 12345
# model1-model6 use the same classifiers imported for the VotingClassifier above
model1 = LogisticRegression(random_state=seed)
model2 = DecisionTreeClassifier(random_state=seed)
model3 = RandomForestClassifier(n_estimators=25, min_samples_split=25,
                                max_depth=5, random_state=seed)
model4 = GaussianNB()
model5 = SVC(kernel='linear')
model6 = xgb.XGBClassifier(random_state=seed, learning_rate=0.01)
sclf = StackingClassifier(classifiers=[model2, model3, model4, model5, model6],
                          meta_classifier=model1)
sclf.fit(xtrain, ytrain)
pred_ensemble = sclf.predict(xtest)
Metrics(ytest, pred_ensemble)
# cross-validate the stacked model (data_scaled/data_Outcome are defined elsewhere)
model = StackingClassifier(classifiers=[model2, model3, model4, model5, model6],
                           meta_classifier=model1)
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
results = model_selection.cross_val_score(model, data_scaled, data_Outcome, cv=kfold)
print('\n******************************************')
print('Average Score for cross validation: {0:0.2f}'.format(results.mean()))
print('******************************************')
#================================================


Question: In how many different ways can we handle categorical variables in Python?


Answer: For encoding categorical features, there are usually three options: LabelEncoder/OneHotEncoder, DictVectorizer, and Pandas get_dummies (a DictVectorizer sketch is included after the code below). Here are the major differences:

1. Pandas get_dummies is usually the most convenient. If you are not sure which of the three to use, use get_dummies. It is the most common way to create dummy variables; it actually adds the dummy columns onto the dataframe, so it changes the dataframe.

2. LabelEncoder converts a categorical column into a column of integer labels from 0 to n-1, i.e. numbers instead of raw text, even though the order of those numbers 0, 1, 2, ... carries no real meaning.

3. Unlike get_dummies, OneHotEncoder (OHE) does not add variables to your data frame. It creates dummy variables by transforming X, and all the dummies are stored in the transformed X. Also, one-hot encoding of high-cardinality features often requires an unrealistic amount of memory and computation. In older scikit-learn versions, OHE only works with categories that are integers greater than or equal to zero, so if the category names are strings or negative numbers you have to transform them (e.g. with LabelEncoder) before fitting OHE.
#===============================================================
from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()
x[:, 0] = labelencoder.fit_transform(x[:, 0])  
#the first column will become a column of number from 0--(n-1).

from sklearn.preprocessing import OneHotEncoder
#note: the categorical_features argument was deprecated in scikit-learn 0.20
#(newer versions use ColumnTransformer instead)
onehotencoder = OneHotEncoder(categorical_features = [0])
x = onehotencoder.fit_transform(x).toarray()
#it will not add actual columns to the dataframe

bin_mileage_dummies = pd.get_dummies(atm3['bin_mileage'],prefix='mileage')
bin_condition_dummies = pd.get_dummies(atm3['bin_condition'],prefix='condition')
atm4=pd.concat([atm3,bin_mileage_dummies,bin_condition_dummies], axis=1)

#to get dummies for column A and D only, not for the other columns:
pd.get_dummies(df, prefix=['A', 'D'], columns=['A', 'D'])

#Code in Redshift to generate the percentile rank and decile
drop_table('data_name1')
ut0 = run_query("""create table data_name1
        DISTSTYLE EVEN
        as select distinct ymmt, mid, mileage,
           percent_rank() over (partition by ymmt order by mileage) as mileage_pcnt
        from data_name0
""")

drop_table('data_name2')
ut0 = run_query("""create table data_name2
        DISTSTYLE EVEN
        as select distinct ymmt, mid, mileage,
           ntile(10) over (partition by ymmt order by mileage) as mileage_decile
        from data_name0
""")
#===============================================================
In a normal training/testing setup, you might get a different number of categories in training vs. testing. For example, for some categorical variable var1, the training data may have 10 categories while the testing data has 15. How can we generate the complete set of dummies to train the model and also score consistently later?

An intuitive approach is to combine the training and testing data, then call get_dummies. Another approach, requiring a bit more code, is to build the complete set of dummies as follows: after df1 = pd.get_dummies(df, prefix=['A', 'D'], columns=['A', 'D']), add the missing columns to df1 and set them to 0 (a reindex-based alternative is sketched after the code below).
#===============================================================
def add_missing_dummy_columns( d, columns ):
    #add any dummy columns from the full column set that are missing in d, filled with 0
    missing_cols = set( columns ) - set( d.columns )
    for c in missing_cols:
        d[c] = 0
add_missing_dummy_columns(df1, ["cat1_new", "cat2_new", "cat_new3"])
#===============================================================
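
An equivalent, slightly more compact alternative is to reindex the test dummies against the training columns. This is a sketch; df_train and df_test are hypothetical training and testing dataframes.
#===============================================================
import pandas as pd

train_dummies = pd.get_dummies(df_train, columns=['A', 'D'])
test_dummies = pd.get_dummies(df_test, columns=['A', 'D'])

#force the test design matrix to have exactly the training columns;
#categories unseen in training are dropped, missing ones are filled with 0
test_dummies = test_dummies.reindex(columns=train_dummies.columns, fill_value=0)
#===============================================================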

Question: How do we break a list or column into multiple bins, deciles, or quantiles?
Answer: We can use the cut or qcut functions in pandas. pd.cut breaks a list or column into bins of equal width, while pd.qcut creates quantile-based bins (e.g. deciles) so that each bin holds roughly the same number of observations.
#===============================================================
label = [1, 2, 3, 4, 5]                    #labels must match the number of bins
#label = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]   #use 10 labels with bins=10 for deciles
atm2.dtypes
bins = 5
atm2['bin_mileage'] = pd.cut(atm2.mileage_function, bins, labels=label)
atm2['bin_condition'] = pd.cut(atm2['condition_function'], bins, labels=label)
atm2['qbin_condition'] = pd.qcut(atm2['condition_function'], bins, labels=label)
atm2['bin_age'] = pd.cut(atm2.age_function, bins, labels=label)
atm2.columns
atm2.head()
atm2.qbin_condition.value_counts()
atm2a=atm2.dropna()
atm3=atm2a.reset_index(drop=True)
atm3.shape

#check=numpy.digitize(atm2.mileage_function, 10)
bin_mileage_dummies = pd.get_dummies(atm3['bin_mileage'],prefix='mileage')
bin_condition_dummies = pd.get_dummies(atm3['bin_condition'],prefix='condition')
bin_age_dummies = pd.get_dummies(atm3['bin_age'],prefix='age')
region_dummies = pd.get_dummies(atm3['mmr_region'])
color_dummies = pd.get_dummies(atm3['color1'])

atm4=pd.concat([atm3,region_dummies,color_dummies,bin_mileage_dummies,bin_condition_dummies,bin_age_dummies], axis=1) 
#===============================================================
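
A small self-contained example (my own toy data) showing the difference between the equal-width bins of pd.cut and the quantile-based bins of pd.qcut:
#===============================================================
import pandas as pd

s = pd.Series([1, 2, 3, 4, 5, 6, 7, 8, 9, 100])

#cut: 5 bins of equal width, so the outlier 100 pulls most values into the first bin
print(pd.cut(s, 5, labels=[1, 2, 3, 4, 5]).value_counts())

#qcut: 5 quantile bins, so each bin gets roughly 2 of the 10 values
print(pd.qcut(s, 5, labels=[1, 2, 3, 4, 5]).value_counts())
#===============================================================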

The regressors package provides several functions that compute various summary statistics for scikit-learn regression models.
#================================================
from sklearn import linear_model
from regressors import stats
ols = linear_model.LinearRegression()
ols.fit(X, y)
##there are 8 summary statistics
stats.sse(ols, X, y)           #1. sum of squared errors
stats.adj_r2_score(ols, X, y)  #2. adjusted R-squared
stats.coef_se(ols, X, y)       #3. standard errors of the coefficients
stats.coef_tval(ols, X, y)     #4. t-values of the coefficients
stats.coef_pval(ols, X, y)     #5. p-values of the coefficients
stats.f_stat(ols, X, y)        #6. F-statistic
stats.residuals(ols, X, y)     #7. residuals
stats.summary(ols, X, y, Xlabels)  #8. full summary table (Xlabels: list of feature names)
#================================================
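
If the regressors package is not available, a common alternative (a sketch using statsmodels, not part of the original notes) is to refit the model with statsmodels and read the same statistics from its summary:
#================================================
import statsmodels.api as sm

X_const = sm.add_constant(X)   #add the intercept column explicitly
ols_sm = sm.OLS(y, X_const).fit()
print(ols_sm.summary())        #coefficients, std errors, t/p-values, R-squared, F-statistic
print(ols_sm.resid[:5])        #residuals
#================================================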
Creating a design matrix with dummies for categorical variables can take a huge amount of space when a categorical variable has hundreds or thousands of levels.

We can use a sparse matrix to reduce the size of the design matrix. Sparse storage works best when the design matrix contains mostly zeros, because only the non-zero elements are stored. A small illustration of the memory savings follows, and then sample code that loads subsets of data from AWS/Redshift, converts each subset into a piece of the design matrix, and appends the pieces into one sparse matrix.
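
Before the full Redshift example, here is a minimal sketch (my own illustration) of how much memory sparse storage saves for a dummy-heavy design matrix:
#================================================
import numpy as np
from scipy import sparse

n_rows, n_levels = 1000000, 2000
rows = np.arange(n_rows)
cols = np.random.randint(0, n_levels, size=n_rows)
vals = np.ones(n_rows)

#one-hot design matrix with a single 1 per row, built directly in sparse CSR format
X_sparse = sparse.csr_matrix((vals, (rows, cols)), shape=(n_rows, n_levels))

dense_bytes = n_rows * n_levels * 8         #float64 dense storage: ~16 GB
sparse_bytes = (X_sparse.data.nbytes + X_sparse.indices.nbytes
                + X_sparse.indptr.nbytes)   #only the non-zeros are stored: ~16 MB
print(dense_bytes // 10**6, 'MB dense vs', sparse_bytes // 10**6, 'MB sparse')
#================================================
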
#================================================
our_model = """log_price ~  
    C(cat_var1,Treatment(reference=ref_value1)
    +C(cat_var2,Treatment(reference=ref_value2)
    +num_var1 + num_var2"""

matrix_data_template_query = """
    select cat_var1, cat_var2,
        avg(num_var1) num_var1,
        avg(num_var2) num_var2
    from redshift_data1
    where var='{segment}'
    group by cat_var1, cat_var2
    """    
#get patsy template design_infos with all levels
template_df = get_data_frame(matrix_data_template_query
                             .format(segment=segment))

templY, templX = pts.dmatrices(our_model,
                               data=template_df, return_type='dataframe')

#get number of rows in training data
seg_cnt = get_data_frame(segment_query.format(
segment=segment, cols='count(*) as train_cnt', 
start_dt='1990-01-01', end_dt=training_end))
seg_cnt

#rows and cols count for sparse matrix init
row_count = seg_cnt.train_cnt[0]
col_count = len(templX.design_info.column_names)

###define the sparsity matrix dimension###
from scipy import sparse
train_lil_X = sparse.lil_matrix((row_count, col_count))
trainY = []
    
engine = sa.create_engine(
    'redshift+psycopg2://{user}:{password}@{host}:{port}/{dbname}'.format(
        dbname=DATABASE,
        user=USER,
        password=PASSWORD,
        port=PORT,
        host=HOST),
    server_side_cursors=True,
    ##needed so fetchmany() streams the result set chunk by chunk##
    encoding='utf-8',
    connect_args={"keepalives": 1,
                  "keepalives_idle": 60, "keepalives_interval": 60}
    )
sdg_minibatch_size = 10000

result = engine.execute(segment_query.format(
segment=segment, cols=segment_query_cols,
start_dt='1990-01-01', end_dt=training_end))
result.keys()

r = result.fetchmany(sdg_minibatch_size)

# incrementally build the sparse matrix
k = 0
while r:
    print('\rprocessing rows', k, end="", flush=True)

    df = pd.DataFrame(r, columns=result.keys())

    # get a chunk of the design matrix
    partY, partX = pts.dmatrices((templY.design_info,
                                  templX.design_info), df)
    partY = partY.ravel()

    # append the chunk of the design matrix to the sparse matrix
    train_lil_X[k:k + partX.shape[0], :] = partX
    trainY = np.append(trainY, partY)

    k = k + partX.shape[0]

    # get the next chunk of 10000 rows
    r = result.fetchmany(sdg_minibatch_size)

print()
print('trainY:', trainY.shape, 'train_lil_X:', train_lil_X.shape,
      'train_lil_X non-zero:', train_lil_X.count_nonzero())

#TODO: csr format to start with, try other formats later
train_csr_X = train_lil_X.tocsr()

#fit a linear model on the sparse design matrix
lr = lm.LinearRegression(fit_intercept=False)
lr.fit(X=train_csr_X, y=trainY)
len(lr.coef_)

data_merge = pd.DataFrame(list(zip(lr.coef_,
                                   templX.design_info.column_names)),
                          columns=['Coefficient', 'Variable'])
#================================================ 
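
The fetchmany loop above can also be wrapped in a generator, which is the "load huge data gradually" idea listed at the top of these notes. This is a sketch built around the same result, templX/templY, and sdg_minibatch_size objects; it would replace, not follow, the while loop above.
#================================================
def iter_chunks(result, chunk_size=10000):
    """Yield pandas DataFrames of at most chunk_size rows from a
    server-side cursor result, keeping only one chunk in memory at a time."""
    cols = result.keys()
    rows = result.fetchmany(chunk_size)
    while rows:
        yield pd.DataFrame(rows, columns=cols)
        rows = result.fetchmany(chunk_size)

#the same incremental build as above, driven by the generator
k = 0
for df in iter_chunks(result, sdg_minibatch_size):
    partY, partX = pts.dmatrices((templY.design_info, templX.design_info), df)
    train_lil_X[k:k + partX.shape[0], :] = partX
    trainY = np.append(trainY, partY.ravel())
    k += partX.shape[0]
#================================================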

Issue: MemoryError in Python when converting a sparse matrix to an array

Answer: When we tried to calculate the Mahalanobis distance via dia_matrix(...).toarray(), you may get a MemoryError. For instance, a 0.25M x 0.25M (250,000 x 250,000) matrix with dtype=float32 needs roughly 250 GB once densified, so dia_matrix((left_term.shape[0], left_term.shape[0]), dtype=np.float32).toarray() raises a MemoryError. Note that merely constructing the sparse matrix, dia_matrix((left_term.shape[0], left_term.shape[0]), dtype=np.float32), does not raise any error; only the .toarray() conversion does.
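
A quick back-of-the-envelope check (my own arithmetic, matching the size above) shows why the dense conversion cannot fit in memory:
#================================================
import numpy as np

n = 250000                                  #0.25M rows
bytes_needed = n * n * np.dtype(np.float32).itemsize
print(bytes_needed / 1e9, 'GB')             #-> 250.0 GB for the dense float32 array
#================================================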

Here is the previous, naive version of the code to calculate the Mahalanobis distance. It computes the full n x n product matrix just to take its diagonal, which wastes both memory and time.
#================================================
#def mahalanobis(x=None, data=None, cov=None):
#    """Compute the Mahalanobis Distance between each row of x and the data  
#    x    : vector or matrix of data with, say, p columns.
#    data : ndarray of the data from which Mahalanobis dist of each record of x is to be computed.
#    cov  : covariance matrix (p x p) of the distribution. If None, will be computed from data.
#    """
#    x_minus_mu = x - np.mean(data)
#    if not cov:
#        cov = np.cov(data.values.T)
#    inv_covmat = sp.linalg.inv(cov)
#    left_term = np.dot(x_minus_mu, inv_covmat)
#    mahal = np.dot(left_term, x_minus_mu.T)
#    return mahal.diagonal()
#================================================

Here is the revised version:
#================================================
#assumes numpy as np and scipy as sp are already imported
def mahalanobis(x=None, data=None, cov=None):
    """Compute the Mahalanobis Distance between each row of x and the data  
    x    : vector or matrix of data with, say, p columns.
    data : ndarray of the data from which Mahalanobis dist of each record of x is to be computed.
    cov  : covariance matrix (p x p) of the distribution. If None, will be computed from data.
    """
    x_minus_mu = x - np.mean(data)
    x_minus_mut=x_minus_mu.T
    if not cov:
        cov = np.cov(data.values.T)
    inv_covmat = sp.linalg.pinv(cov)
#    inv_covmat = sp.linalg.inv(cov)
    left_term = np.dot(x_minus_mu, inv_covmat)
    left_term.shape[0]
    
    # z = dia_matrix((left_term.shape[0],left_term.shape[0]), dtype=np.float32).toarray()
    #z=dia_matrix((left_term.shape[0],left_term.shape[0]), dtype=np.int8).toarray().astype(np.float16)
    #z=np.zeros((left_term.shape[0],left_term.shape[0]))
    #float16 zeros use half the memory of the float32 dense version
    z = np.zeros((left_term.shape[0], left_term.shape[0]), dtype=np.float16)

#    z.shape
    #only the diagonal entries are ever filled, so accumulate z[k][k] directly
    for k in range(left_term.shape[0]):
#        print("running k for ... ", k)
        z[k][k] = 0
        for j in range(x_minus_mut.shape[0]):
#            print("running j for ... ", j)
            z[k][k] = z[k][k] + left_term[k][j]*x_minus_mut[k][j]

#    mahal = np.dot(left_term, x_minus_mu.T)
    return z.diagonal()
#================================================ 
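
Since only the diagonal of left_term @ x_minus_mu.T is needed, the double loop and the n x n buffer can both be avoided with a row-wise product. This is an alternative sketch of my own, not the original author's code:
#================================================
import numpy as np
from scipy import linalg

def mahalanobis_diag(x, data, cov=None):
    """Squared Mahalanobis distance of each row of x, without forming the n x n matrix."""
    x = np.asarray(x, dtype=float)
    data = np.asarray(data, dtype=float)
    x_minus_mu = x - data.mean(axis=0)
    if cov is None:
        cov = np.cov(data.T)
    inv_covmat = linalg.pinv(cov)
    left_term = x_minus_mu @ inv_covmat
    #row-wise dot product == diagonal of left_term @ x_minus_mu.T
    return np.einsum('ij,ij->i', left_term, x_minus_mu)
#================================================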


