Code for Logistic Regression using PyTorch in Python Tutorial



logistic_regression_pytorch.py

# %%
# download the churn dataset used in this tutorial
!gdown --id 12vfq3DYFId3bsXuNj_PhsACMzrLTfObs

# %%
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.utils import resample
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from warnings import filterwarnings
filterwarnings('ignore')

# %%
# reading the data
data = pd.read_csv("data_regression.csv")
## Check the dimensions of the data and whether the output column is continuous or discrete.
## Here the output (churn) is discrete, so a classification algorithm should be applied.
data = data.drop(["year", "customer_id", "phone_no"], axis=1)   # dropping columns that carry no predictive signal
print(data.shape)         # looking at the shape of the data
print(data.columns)       # looking at how many columns the data has
print(data.dtypes)        # checking the data type of each column
data.head()

# %%
data.isnull().sum()       # checking for missing values in each column

# %%
final_data = data.dropna()         # Dropping the null values
final_data.head()
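
# %%
## Optional aside (not part of the original tutorial): instead of dropping rows with
## missing values, numeric columns could be imputed, e.g. with the column median.
## A minimal sketch on a copy of the data:
imputed_demo = data.fillna(data.median(numeric_only=True))
print(imputed_demo.isnull().sum())   # string columns with missing values would still need handling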

# %%
final_data["churn"].value_counts()       
# let us see how many data is there in each class for deciding the sampling data number

# %%
data_majority = final_data[final_data['churn']==0] # class 0
data_minority = final_data[final_data['churn']==1] # class 1
# upsampling the minority class
data_minority_upsampled = resample(data_minority, replace=True, n_samples=900, random_state=123)
# downsampling the majority class
data_majority_downsampled = resample(data_majority, replace=False, n_samples=900, random_state=123)
## Data Concatenation: concatenating the upsampled and downsampled dataframes
data2 = pd.concat([data_majority_downsampled, data_minority_upsampled])
## Encoding Categoricals: we need to encode the categorical variables before feeding them to the model
data2[['gender', 'multi_screen', 'mail_subscribed']].head()   # inspecting the categorical columns
# label encoding the categorical variables
label_encoder = preprocessing.LabelEncoder()
data2['gender'] = label_encoder.fit_transform(data2['gender'])
data2['multi_screen'] = label_encoder.fit_transform(data2['multi_screen'])
data2['mail_subscribed'] = label_encoder.fit_transform(data2['mail_subscribed'])
## Let's now check the distribution of the output class again after resampling
data2["churn"].value_counts()

# %%
# shuffle the dataframe and reset the index before separating features and target
data2 = data2.sample(frac=1).reset_index(drop=True)
# independent variables (all columns except the last one, churn)
X = data2.iloc[:,:-1]
## This X will be fed to the model to learn the parameters
# scaling the data: bringing the mean to 0 and the variance to 1 for a less noisy optimization
sc = StandardScaler()
X = sc.fit_transform(X)
n_samples, n_features = X.shape   # number of samples and number of features
## Keeping the output column in a separate series
Y = data2["churn"]
## Data Splitting:
## The data is processed, so we can now split it into train and test sets:
## the model is trained on the training data and evaluated later on the test data.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.30, random_state=42, stratify=Y)
print((y_train == 1).sum())
print((y_train == 0).sum())
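
# %%
## Optional check (not part of the original tutorial): stratified splitting should keep
## the class balance roughly equal in the train and test sets.
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))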

# %%
print(type(X_train))
print(type(X_test))
print(type(y_train.values))
print(type(y_test.values))

# %%
# converting the numpy arrays to float32 torch tensors
X_train = torch.from_numpy(X_train.astype(np.float32))
X_test = torch.from_numpy(X_test.astype(np.float32))
y_train = torch.from_numpy(y_train.values.astype(np.float32))
y_test = torch.from_numpy(y_test.values.astype(np.float32))
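
# %%
## Optional (not part of the original tutorial): on larger problems the tensors and the
## model would typically be moved to a GPU when one is available; this dataset is small
## enough that CPU training is fine. A minimal sketch:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)   # tensors and the model could then be moved with .to(device)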

# %%
y_train.shape, y_test.shape

# %%
# reshaping the targets into column vectors of shape (n, 1) to match the model output
y_train = y_train.view(y_train.shape[0], 1)
y_test = y_test.view(y_test.shape[0], 1)

# %%
y_train.shape, y_test.shape

# %%
# logistic regression class
class LogisticRegression(nn.Module):
    def __init__(self, n_input_features):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(n_input_features, 1)
    
    # forward pass: linear layer followed by a sigmoid, so the output is a probability in (0, 1)
    def forward(self, x):
        y_pred = torch.sigmoid(self.linear(x))
        return y_pred
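
# %%
## Optional sanity check (not part of the original tutorial): running an untrained model
## on a small random batch confirms that the forward pass returns probabilities in (0, 1).
with torch.no_grad():
    demo_model = LogisticRegression(n_features)
    print(demo_model(torch.randn(4, n_features)))   # 4 outputs, each between 0 and 1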

# %%
lr = LogisticRegression(n_features)   # instantiating the model with the number of input features

# %%
num_epochs = 500
# training the model for a large number of epochs to see better results
learning_rate = 0.0001
criterion = nn.BCELoss()
# we are doing logistic regression, so binary cross-entropy is the appropriate loss
optimizer = torch.optim.SGD(lr.parameters(), lr=learning_rate)
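
# %%
## Optional (not part of the original tutorial): plain SGD with a small learning rate can
## converge slowly; Adam is a common drop-in alternative. A minimal sketch on a throwaway
## model, so the SGD setup above stays in effect:
adam_demo_model = LogisticRegression(n_features)
adam_demo_optimizer = torch.optim.Adam(adam_demo_model.parameters(), lr=0.01)
print(adam_demo_optimizer)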

# %%
for epoch in range(num_epochs):
    y_pred = lr(X_train)                          # forward pass
    loss = criterion(y_pred, y_train)             # compute the binary cross-entropy loss
    loss.backward()                               # backpropagate the gradients
    optimizer.step()                              # update the weights
    optimizer.zero_grad()                         # reset the gradients for the next iteration
    if (epoch+1) % 20 == 0:
        # printing the loss value every 20 epochs to keep track of training
        print(f'epoch: {epoch+1}, loss = {loss.item():.4f}')
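
# %%
## Optional (not part of the original tutorial): keeping the loss history makes it easy to
## plot the training curve. A minimal sketch that retrains a fresh model while recording
## the loss at every epoch (assumes matplotlib is installed):
import matplotlib.pyplot as plt

history_model = LogisticRegression(n_features)
history_optimizer = torch.optim.SGD(history_model.parameters(), lr=learning_rate)
losses = []
for epoch in range(num_epochs):
    loss = criterion(history_model(X_train), y_train)
    loss.backward()
    history_optimizer.step()
    history_optimizer.zero_grad()
    losses.append(loss.item())
plt.plot(losses)
plt.xlabel("epoch")
plt.ylabel("BCE loss")
plt.show()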

# %%
# evaluation on the test set; no gradient tracking is needed here
with torch.no_grad():
    y_predicted = lr(X_test)                  # predicted probabilities
    y_predicted_cls = y_predicted.round()     # thresholding at 0.5 to get class labels
    acc = y_predicted_cls.eq(y_test).sum() / float(y_test.shape[0])
    print(f'accuracy: {acc.item():.4f}')
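
# %%
## Optional (not part of the original tutorial): rounding the probabilities corresponds to a
## 0.5 decision threshold. If one type of error is more costly, the threshold can be tuned;
## a minimal sketch with a hypothetical stricter threshold:
with torch.no_grad():
    threshold = 0.6
    y_pred_strict = (lr(X_test) >= threshold).float()
    print((y_pred_strict == y_test).float().mean().item())   # accuracy at the new threshold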

# %%
# classification report (precision, recall and f1-score per class)
from sklearn.metrics import classification_report
print(classification_report(y_test.numpy(), y_predicted_cls.numpy()))

# %%
# confusion matrix (avoid shadowing the imported function name)
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test.numpy(), y_predicted_cls.numpy())
print(cm)
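
# %%
## Optional (not part of the original tutorial): the confusion matrix can also be plotted;
## ConfusionMatrixDisplay is available in recent scikit-learn versions.
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt

ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=["no churn", "churn"]).plot()
plt.show()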

# %%