tutorial 5

.pdf

School

University of California, Berkeley *

*We aren’t endorsed by this school

Course

144

Subject

Industrial Engineering

Date

Oct 30, 2023

Type

pdf

Pages

Uploaded by BarristerJay3669

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# read data
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

df_train.shape
(891, 12)

df_test.shape
(418, 11)

# list of features
list(df_train.columns)
['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']

# here we choose to drop nan values, but you can choose how to deal with such cases on your own
df_train = df_train.dropna()

# use gender to predict survival
df_train2 = df_train[['Sex']]

# one-hot encode categorical variable
one_hot_encoded = pd.get_dummies(df_train2['Sex'], prefix='Sex')

# Concatenate the one-hot encoded columns with the original DataFrame
df_encoded = pd.concat([df_train2, one_hot_encoded], axis=1).drop('Sex', axis = 1)
df_encoded.head()
    Sex_female  Sex_male
1            1         0
3            1         0
6            0         1
10           1         0
11           1         0

y_train = df_train['Survived']
y_train
1      1
3      1
6      0
10     1
11     1
      ..
871    1
872    0
879    1
887    1
889    1
Name: Survived, Length: 183, dtype: int64

# train DT Classifier
clf = DecisionTreeClassifier()
clf.fit(df_encoded, y_train)
accuracy_score(clf.predict(df_encoded), y_train)
0.7431693989071039

# split training data into train/validation for validation
from sklearn.model_selection import train_test_split
X_train, X_val, Y_train, Y_val = train_test_split(df_encoded, y_train)
clf = DecisionTreeClassifier()
clf.fit(X_train, Y_train)
accuracy_score(clf.predict(X_train), Y_train)
0.7299270072992701
accuracy_score(clf.predict(X_val), Y_val) #this is more representative of your kaggle score
0.782608695652174

# IMP: YOU DON'T HAVE TO DO THIS. You can try other, more useful replacements for na.
# But don't drop na
df_test = df_test.fillna(9)
df_test2 = df_test[['Sex']]

# one-hot encode categorical variable
one_hot_encoded2 = pd.get_dummies(df_test2['Sex'], prefix='Sex')

# Concatenate the one-hot encoded columns with the original DataFrame
df_encoded_test = pd.concat([df_test2, one_hot_encoded2], axis=1).drop('Sex', axis = 1)
clf.predict(df_encoded_test)
array([0, 1, 0, 0, 1, 0, 1, 0, 1, @, @, ©, 1, @0, 1, 1, @, @, 1, 1, O, O, 1, 6,1, 0,1, 0, 0,0, 0,0,1,1, 90, 0,1,1, 0, 9, 9, 9, 0, 1, 1, 6, 0,0, 1,1, 0,0,1,1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 6,06, 1,1, 0, 1, 0, 1, 6, 6, 1, 6, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1) 0) 1) e) 1) 0) e, 0, 1’ 0) 1) eJ 1) e’ e) e) 1J 0) e) 0) e’ a.’ e,1, 1, 1, 1,0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 9, 0O, 1) 9, 0, e, e) e) 1, 0, 1) 1) e) e) eJ 0) e) e) e) 01 1) e’ e’ 1’ 9, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 9, @, @, O, O, 1 - - - 17 - o e o L Y L5 e e e Y 1% e ‘e e Y [

6,0,90,0,90,1,90,0,90,0,1,0,90,0,90,0,90,0,1,1, 0,0, 0,1, 0,90,1,90,1,0,0,0,0,0, 1,1, 1, 1,1, 0,1, 0, 9, ¢])

df_test2.shape
(418, 1)

# create dataframe to submit
submission = pd.DataFrame(index=df_test.PassengerId)
submission['Survived'] = clf.predict(df_encoded_test)
submission.head()
             Survived
PassengerId
892
893
894
895

# convert to csv for submission
submission.reset_index().to_csv('submission.csv', index=False)

Your preview ends here

Eager to read complete document? Join bartleby learn and gain access to the full version