# Data Mining Python Examples
#
# Scikit-learn
#
# Scikit-Learn tutorial:
# http://scikit-learn.org/stable/tutorial/basic/tutorial.html
#
# Every intermediate value is passed to print() so the script shows the same
# information whether it is pasted into an interactive session line by line
# or executed as a file (bare expressions are silent in the latter case).

#####################################
#Importing Modules sklearn and numpy#
#####################################
from io import StringIO

import numpy as np

# NOTE(review): the wildcard import is retained only in case other code
# relies on the names it binds; everything this script uses is imported
# explicitly right below.
from sklearn import *
from sklearn import (
    datasets,
    ensemble,
    metrics,
    model_selection,
    naive_bayes,
    neighbors,
    svm,
    tree,
)

# numpy is imported under the alias `np` only, so of the three spellings
#     arange(5)          # NameError: `arange` is not bound
#     numpy.arange(5)    # NameError: `numpy` is not bound
#     np.arange(5)       # works
# only the last one is valid here.
print(np.arange(5))

#################################
#Built in Data Sets from sklearn#
#################################

#Importing iris data
iris = datasets.load_iris()
print(iris)
print(type(iris))

#Iris independent variables
print(iris.data)
print(type(iris.data))
print(np.shape(iris.data))
print(iris.data.shape)

#Iris Dependent variable (Target Variable)
print(iris.target)
print(type(iris.target))
print(iris.target.shape)

###########################
#Training and Testing Data#
###########################
n_samples = iris.data.shape[0]
print(iris.data.shape)
print(n_samples)
print(np.arange(n_samples))

# Split the row *indices* (not the data itself) 70/30 so the same partition
# can be used to index both iris.data and iris.target.
# `model_selection` replaces the `cross_validation` module, which newer
# scikit-learn releases no longer provide.
train, test = model_selection.train_test_split(
    np.arange(n_samples), test_size=0.3, random_state=5366
)
print(train)
print(test)

X_train, X_test = iris.data[train, :], iris.data[test, :]
y_train, y_test = iris.target[train], iris.target[test]
print(X_train)
print(X_test)
print(y_train)
print(y_test)


def _fit_and_report(estimator):
    """Fit *estimator* on the training split and report test-split results.

    The estimator is fitted on the module-level ``X_train``/``y_train`` and
    used to predict ``X_test``. The estimator's class name, the confusion
    matrix and the accuracy score against ``y_test`` are printed.

    Returns:
        A ``(fitted_model, predictions)`` tuple, where ``fitted_model`` is
        whatever ``estimator.fit`` returned and ``predictions`` is the
        result of calling ``predict`` on the test rows.
    """
    fitted = estimator.fit(X_train, y_train)
    predicted = fitted.predict(X_test)
    print(type(fitted).__name__)
    print(metrics.confusion_matrix(y_test, predicted))
    print(metrics.accuracy_score(y_test, predicted))
    return fitted, predicted


################
#Decision Trees#
################
model, predspecies = _fit_and_report(tree.DecisionTreeClassifier())

#Plotting the tree (Requires pydot and graphviz)
#http://www.graphviz.org/
# pydot is imported here rather than at the top of the file because only
# this plotting step needs it.
import pydot

# io.StringIO stands in for the vendored `sklearn.externals.six.StringIO`,
# which newer scikit-learn releases no longer ship.
dot_data = StringIO()
tree.export_graphviz(model, out_file=dot_data)
parsed = pydot.graph_from_dot_data(dot_data.getvalue())
# Newer pydot releases return a list of graphs, older ones a single graph;
# accept both so write_pdf is always called on a graph object.
graph = parsed[0] if isinstance(parsed, (list, tuple)) else parsed
graph.write_pdf("IrisTree.pdf")

#########################
#K-Fold Cross-validation#
#########################
# Uses the full data set (not the train/test split above) with 10 folds;
# `model` is still the decision tree at this point.
scores = model_selection.cross_val_score(
    model, iris.data, iris.target, cv=10
)
print(scores)
print(np.mean(scores))

#####################
#K-Nearest Neighbors#
#####################
model, predspecies = _fit_and_report(neighbors.KNeighborsClassifier())

######################
#Gaussian Naive Bayes#
######################
#Note that we are using Gaussian Naive Bayes since the
#features in this problem (Sepal Length, Sepal Width, etc)
#are quantitative.
#See the documentation for examples with multinomial Naive Bayes, etc.
model, predspecies = _fit_and_report(naive_bayes.GaussianNB())

########################
#Support Vector Machine#
########################
model, predspecies = _fit_and_report(svm.SVC())

#########
#Bagging#
#########
model, predspecies = _fit_and_report(ensemble.BaggingClassifier())

##########
#Boosting#
##########
model, predspecies = _fit_and_report(ensemble.AdaBoostClassifier())

###############
#Random Forest#
###############
model, predspecies = _fit_and_report(ensemble.RandomForestClassifier())