In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [3]:
import os
os.getcwd()
Out[3]:
In [4]:
os.chdir('C:\\Users\\John Robertson\\Documents\\python_test')
os.getcwd()
Out[4]:
In [7]:
# Read the column names for the dataset from field_names.txt, one name per
# line, skipping blank lines.  A `with` block guarantees the file handle is
# closed even if an exception is raised mid-read (the original relied on an
# explicit close() that would be skipped on error).
headerList = []
with open('field_names.txt', 'r') as headerFile:
    for line in headerFile:
        nextHeader = line.rstrip()
        if nextHeader:  # ignore empty / whitespace-only lines
            headerList.append(nextHeader)
print(headerList)
In [8]:
# Load the dataset.  The CSV carries no header row, so the names read from
# field_names.txt are applied, and the first column (the sample ID) becomes
# the index.
df = pd.read_csv('breast-cancer.csv', header = None, names = headerList, index_col = 0)
In [112]:
# Quick sanity summary: dimensions and class balance.
print("Dimensions are " + str(df.shape))
# Counting with mask.sum() avoids materializing a whole filtered DataFrame
# just to take its length.
print("Number of malignant " + str((df.diagnosis == 'M').sum()))
print("Number of benign " + str((df.diagnosis == 'B').sum()))
print(df.iloc[0]) # take a look at an element
In [9]:
# create a normalized version, where each variable is centered and normalized by the std dev
# (classic z-scoring: subtract the column mean, divide by the sample std dev)
df_numerical = df.drop(['diagnosis'], axis=1)
df_numerical_norm = df_numerical.sub(df_numerical.mean()).div(df_numerical.std())
# Re-attach the diagnosis label column in front of the normalized features.
df_norm = df.loc[:, ['diagnosis']].join(df_numerical_norm)
print(df_norm.iloc[0])  # take a look at an element
In [10]:
# We want to plot the data
# Visualization can help us recognize dangers, unusual features,
# and our end results should correspond with what we can see visually
# so it helps prevent technical mistakes from
# leading us to wrong conclusions
#
# For each feature, overlay normalized histograms of the benign and
# malignant groups so their separation is visible at a glance.
for label in headerList[2:]:
    bins = np.linspace(-4, 4, 100)
    # plt.hold() was deprecated in matplotlib 2.0 and removed in 3.0;
    # successive plotting calls draw on the same axes by default, so the
    # call is simply dropped.
    plt.hist(df_norm[label][df.diagnosis == 'B'], bins, alpha = .5, label = 'B')
    plt.hist(df_norm[label][df.diagnosis == 'M'], bins, alpha = .5, label = 'M')
    plt.legend(loc='upper right')
    plt.suptitle(label)
    plt.show()
In [11]:
# Compute the mean and median smoothness and compactness for benign and malignant tumors -
# do they differ?
# Explain how you would identify this.
# Answer. We have three columns for smoothness and three columns for compactness.
# It is not clear what smoothness_sd_error or compactness_sd_error mean. Without more understanding of the data
# I would assume that I am being asked for the mean and median of the columns smoothness_mean and compactness_mean
# it should be noted that the visual histograms plotted above give a meaningful answer already.
# I am computing a normalized mean and median which makes it easy to tell by inspection that their difference is significant
# Hoist the two boolean selections into named variables so each filter is
# computed once; output text and order are unchanged.
malignant_smoothness = df_norm.smoothness_mean[df.diagnosis == 'M']
benign_smoothness = df_norm.smoothness_mean[df.diagnosis == 'B']
print("Malignant smoothness mean = " + str(np.mean(malignant_smoothness)))
print("Benign smoothness mean = " + str(np.mean(benign_smoothness)))
print("Malignant smoothness median = " + str(np.median(malignant_smoothness)))
print("Benign smoothness median = " + str(np.median(benign_smoothness)))
# If this was for a scientific study was going to be published then I would use traditional statistical tests -- some version of
# students t-test is the standard I believe.
# Problems like this that are classical statistics
# are not commonly called "big data" because they were doable before the days of terabytes of data and hardware
# capable of processing that. In fact, they could be computed (tediously) before computers existed by hand.
# By inspection, since there are a few hundred samples in each group, the variation in the means should be on the order of 1/sqrt(200) or about 1/14
# But instead they differ by 1.1 and by .9. So they are roughly 8 sample deviations apart, which means they are genuinely different
In [12]:
# Write a function to generate bootstrap samples of the data
# Bootstrap samples are samples with replacement so to get a sample of N rows with replacement we would use
from random import randint
def getSamples(n, dataFrame):
    """Return a bootstrap sample of n rows drawn with replacement from dataFrame.

    Parameters
    ----------
    n : int
        Number of rows to draw.
    dataFrame : pandas.DataFrame
        Frame to sample from.

    Returns
    -------
    pandas.DataFrame
        n rows of dataFrame, selected positionally with replacement.
    """
    newList = []
    rowCount = len(dataFrame)
    for i in range(n):
        newList.append(randint(0, rowCount - 1))
    # Bug fix: sample from the dataFrame argument, not the notebook-global
    # df.  The original returned df.iloc[newList], so calling
    # getSamples(10, df_norm) silently sampled the wrong (un-normalized)
    # frame.
    return dataFrame.iloc[newList]
In [13]:
# test our Bootstrap function to generate a set of 10 samples
# NOTE(review): no random seed is set, so this draw is not reproducible
# across kernel restarts.
getSamples(10,df_norm)
Out[13]:
In [14]:
# Random forest variable importance is a common way
# to pick out which variables are most important
from sklearn.ensemble import ExtraTreesClassifier
forest = ExtraTreesClassifier(n_estimators = 500)
forest.fit(df_numerical_norm, df.diagnosis)
importances = forest.feature_importances_
# importance_stds = np.std([tree.feature_importances_ for tree in forest.estimators_], axis = 0)
# Sort feature indices from most to least important.
importance_indices = np.argsort(importances)[::-1]
# Hoist the column-name list out of the loops: the original rebuilt
# list(df_numerical_norm) on every iteration.
feature_names = list(df_numerical_norm)
for i in range(df_numerical_norm.shape[1]):
    print(feature_names[i] + " " + str(importances[i]))
print("-------------")
print("In order of importance")
# Iterate the sorted indices directly instead of indexing into them by
# position.
for j in importance_indices:
    print(feature_names[j] + " " + str(importances[j]))
# Scree-style plot of the sorted importances.
plt.plot(range(len(importance_indices)), importances[importance_indices], 'ro')
plt.show()
In [15]:
# The plot of variable importance using random forests is very useful
# Offhand, it is not necessarily best to just grab the top 3 or 5
# most important variables. We see distinct groups of variables with
# comparable importance in this plot, and it may be that they have comparable
# importance because they are strongly correlated, i.e. possibly variables ranked
# 6,7,8 above are so close in importance because they are tightly correlated
# and each one gives no more information than the others. But we have cut down the
# playing field of interesting variables significantly.
In [37]:
# Identify 2-3 variables that are predictive of a malignant tumor.
# Display the relationship visually and write 1-2 sentences explaining the relationship.
# The two strongest ones are fractal_dimension_mean and concavity_worst and malignant tumors
# have larger values of both of those. I don't know precisely how those geometric quantities were
# measured. Offhand, one sounds like it means malignant tumors have a more pitted and crinkled surface.
# I have already displayed the relationship visually with the histograms above.
plt.plot(df_norm.fractal_dimension_mean[df.diagnosis == 'B'], df_norm.concavity_worst[df.diagnosis == 'B'],'o', alpha = 0.2, label='Benign')
plt.plot(df_norm.fractal_dimension_mean[df.diagnosis == 'M'], df_norm.concavity_worst[df.diagnosis == 'M'],'o', alpha = 0.2, label='Malignant')
plt.legend(loc = 'upper left')
# Axis labels added so the figure stands alone when skimmed.
plt.xlabel('fractal_dimension_mean (normalized)')
plt.ylabel('concavity_worst (normalized)')
plt.axis('scaled')
plt.show()
# Plotting these two variables for both groups together it appears that they are not too strongly correlated
# and that each of these two variable independently helps reduce the overlap between malignant and benign tumors
# That is, the x,y pairs are more separated than either the x coordinates alone or the y coordinates alone would be
In [18]:
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed
# in 0.20; cross_val_score now lives in sklearn.model_selection.
from sklearn.model_selection import cross_val_score
In [19]:
X = df_numerical_norm  # features: all numeric columns, z-scored above
Y = df_norm.diagnosis  # target labels: 'M' (malignant) / 'B' (benign)
In [20]:
# Fresh (unfitted) estimator; cross_val_score clones and fits it per fold.
forest = ExtraTreesClassifier(n_estimators = 500)
In [21]:
# 5-fold cross-validation scores (mean accuracy per fold) for the forest.
forest_result = cross_val_score(forest, X, Y, cv = 5)
In [22]:
print(forest_result)  # one accuracy score per fold
In [23]:
# These scores are the portion of correctly classified samples.
# These are good scores, and they are consistent scores.
# One of the downsides of cross validation in python is that it doesn't
# return the scores on the training set as well as on the test set.
# You will normally see better scores on the training set than on the
# test set. But if you see significantly better scores on the
# training set than on the test set, that is because you are overfitting
# the data. Effectively, these scores are so high that we
# know we are not overfitting dramatically anyway.
In [24]:
# I already determine the most important variables in a random forest model
In [25]:
# I like SVMs but they are poor at helping you identify the most important variables
# So for the second case I will just use linear regression
In [26]:
from sklearn.svm import SVC
In [27]:
# NOTE(review): SVC() with default parameters uses the RBF kernel, not a
# linear kernel; the discussion below refers to a "linear SVM" — confirm
# which was intended.
svm = SVC()
In [28]:
# 5-fold cross-validation scores for the SVM on the same features/labels.
svmResult = cross_val_score(svm, X, Y, cv = 5)
In [29]:
svmResult
Out[29]:
In [30]:
# It is not easy from an SVM to determine what the most important variables are.
# SVMs are more of a black box. They are best where there is
# sparse data and you want a black box predictor rather than insight
# about the meaning of the predictions. That is why they
# are used so frequently in computer vision when the data is ALWAYS sparse.
# We know we didn't overfit because the cross-validated results are so very high.
# (Note: SVC() with defaults uses the RBF kernel rather than a linear one;
# linear SVMs in particular are known to be robust against overfitting.)
No comments:
Post a Comment