Prévia do material em texto
# Importando bibliotecas necessárias
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
# Global plot styling: seaborn whitegrid theme, 10x6-inch default canvas.
sns.set(style="whitegrid")
plt.rcParams["figure.figsize"] = (10.0, 6.0)
# Load the dataset from disk into a DataFrame.
data = pd.read_csv('Titanic-Dataset.csv') # Change the file name/path as needed
# NOTE: .head()/.describe() return values are only rendered in a notebook;
# in a plain script, wrap them in print(...) to see the output.
data.head()
# Column dtypes and non-null counts (output reproduced further below)
data.info()
# Descriptive statistics for the numeric columns
data.describe()
# --- Feature selection and preprocessing (pass 1) ---
# Keep only the columns used by the models; .copy() ensures we work on an
# independent frame rather than a view (avoids SettingWithCopy issues).
data = data[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']].copy()

# Impute missing values via direct assignment. The original chained
# `data[col].fillna(..., inplace=True)` pattern raises a FutureWarning
# (visible in the captured output below) and will stop working in pandas 3.0,
# because the intermediate Series behaves as a copy.
data['Age'] = data['Age'].fillna(data['Age'].median())
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

# Encode categorical variables as integer labels (alphabetical order:
# e.g. female -> 0, male -> 1).
data['Sex'] = LabelEncoder().fit_transform(data['Sex'])
data['Embarked'] = LabelEncoder().fit_transform(data['Embarked'])

# Split features and target.
X = data.drop(['Survived'], axis=1)
y = data['Survived']

# Standardize only the continuous features; the remaining columns are
# ordinal/binary integer codes.
scaler = StandardScaler()
X[['Age', 'Fare']] = scaler.fit_transform(X[['Age', 'Fare']])
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Baseline model: logistic regression ---
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

held_out_preds = logreg.predict(X_test)
positive_class_probs = logreg.predict_proba(X_test)[:, 1]

# Evaluation on the held-out set: accuracy, ROC AUC and per-class report.
accuracy_logreg = accuracy_score(y_test, held_out_preds)
roc_auc_logreg = roc_auc_score(y_test, positive_class_probs)
print("Acurácia Regressão Logística:", accuracy_logreg)
print("ROC AUC Regressão Logística:", roc_auc_logreg)
print(classification_report(y_test, held_out_preds))
# --- Ensemble model: random forest (seeded for reproducibility) ---
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

forest_preds = rf.predict(X_test)
forest_probs = rf.predict_proba(X_test)[:, 1]

# Evaluation on the same held-out split used for the logistic model.
accuracy_rf = accuracy_score(y_test, forest_preds)
roc_auc_rf = roc_auc_score(y_test, forest_probs)
print("Acurácia Random Forest:", accuracy_rf)
print("ROC AUC Random Forest:", roc_auc_rf)
print(classification_report(y_test, forest_preds))
# Side-by-side comparison of the two models.
summary_rows = [
    ('Regressão Logística', accuracy_logreg, roc_auc_logreg),
    ('Random Forest', accuracy_rf, roc_auc_rf),
]
results = pd.DataFrame(summary_rows, columns=['Modelo', 'Acurácia', 'ROC AUC'])
print(results)
# Example visualization: stacked age histogram split by survival outcome,
# with a kernel-density overlay.
sns.histplot(data=data, x='Age', hue='Survived', kde=True, multiple='stack')
plt.title('Distribuição de Idade por Sobrevivência')
plt.show()
# --- Preprocessing pass 2 ---
# NOTE(review): this block re-runs feature selection and imputation on the
# already-processed frame; it duplicates earlier work and is kept only for
# parity with the source material.
data = data[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']].copy()

# Impute missing values via direct assignment — the original chained
# `fillna(..., inplace=True)` raises a FutureWarning and will break in
# pandas 3.0.
data['Age'] = data['Age'].fillna(data['Age'].median())
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])

# 'Sex' was already label-encoded above, so this re-encoding maps the same
# 0/1 codes to themselves. (The redundant mid-file re-import of
# LabelEncoder was removed — it is imported at the top of the file.)
data['Sex'] = LabelEncoder().fit_transform(data['Sex'])

# One-hot encode 'Embarked'; drop_first avoids perfect collinearity among
# the dummy columns.
data = pd.get_dummies(data, columns=['Embarked'], drop_first=True)

# Split target and explanatory variables.
X = data.drop('Survived', axis=1)
y = data['Survived']
# --- Preprocessing pass 3 ---
# NOTE(review): a third repetition of the cleanup, this time on the
# one-hot-encoded frame; the 'Embarked_*' dummy columns are discarded by
# the column selection below — presumably intentional, but worth confirming.
# Drop identifier-like columns up front; errors='ignore' makes this a
# safe no-op when they were already removed.
data = data.drop(columns=['Name', 'Ticket', 'Cabin'], errors='ignore')

# Keep only the modelling columns (this drops the Embarked dummies).
data = data[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare']].copy()

# Impute missing ages via direct assignment (no `inplace=True`).
data['Age'] = data['Age'].fillna(data['Age'].median())

# 'Sex' already holds 0/1 integer codes, so re-encoding is an identity
# mapping. (The redundant mid-file re-import of LabelEncoder was removed —
# it is imported at the top of the file.)
data['Sex'] = LabelEncoder().fit_transform(data['Sex'])

# Split target and explanatory variables.
X = data.drop('Survived', axis=1)
y = data['Survived']
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 PassengerId 891 non-null int64
1 Survived 891 non-null int64
2 Pclass 891 non-null int64
3 Name 891 non-null object
4 Sex 891 non-null object
5 Age 714 non-null float64
6 SibSp 891 non-null int64
7 Parch 891 non-null int64
8 Ticket 891 non-null object
9 Fare 891 non-null float64
10 Cabin 204 non-null object
11 Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
Acurácia Regressão Logística: 0.8100558659217877
ROC AUC Regressão Logística: 0.8823680823680824
precision recall f1-score support
0 0.83 0.86 0.84 105
1 0.79 0.74 0.76 74
accuracy 0.81 179
macro avg 0.81 0.80 0.80 179
weighted avg 0.81 0.81 0.81 179
:32: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
data['Age'].fillna(data['Age'].median(), inplace=True)
:33: FutureWarning: A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.
For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.
data['Embarked'].fillna(data['Embarked'].mode()[0], inplace=True)
Acurácia Random Forest: 0.8156424581005587
ROC AUC Random Forest: 0.9001287001287002
precision recall f1-score support
0 0.83 0.87 0.85 105
1 0.80 0.74 0.77 74
accuracy 0.82 179
macro avg 0.81 0.80 0.81 179
weighted avg 0.81 0.82 0.81 179
Modelo Acurácia ROC AUC
0 Regressão Logística 0.810056 0.882368
1 Random Forest 0.815642 0.900129