Exploring the Trends Within Heart Disease and Predicting Diagnosis

Introduction and Motivation

Heart disease is the leading cause of death in many developed nations. As a result, the research, awareness, and prevention of heart disease garner a lot of attention in both the medical community and the general public. Although the fear of heart disease occurring to oneself resides almost exclusively in the older population, the death of a loved one can greatly impact affected families as a whole.

Created by Yuan Qi on 05/16/2022

First, here are the libraries and imports used in this project.
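
A minimal set of imports covering everything used below (the exact list in the original notebook may differ slightly):

```python
# Core data handling and numerical libraries
import pandas as pd
import numpy as np

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Statistics and machine learning
from scipy.stats import ttest_ind
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
```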

The Process

1. The Dataset Used

The dataset I used comes from UC Irvine (UCI)'s Machine Learning Repository and is downloadable from this link. It includes the processed Cleveland heart disease dataset, created by the Cleveland Clinic Foundation, which is the one I used. The original dataset includes 76 features, but the processed one only includes 14 of them. There are a lot of technical terms involved in this dataset, so I've included links below if you'd like to learn more about the different features. The 14 features used in the processed database are as follows:

  1. age: the age of the patient
  2. sex: the sex of the patient
  3. cp : type of chest pain, takes 4 values
    • 1: typical angina
    • 2: atypical angina
    • 3: non-anginal pain (i.e. pain not caused by heart disease)
    • 4: asymptomatic (i.e. no symptoms of chest pain)
  4. trestbps: resting blood pressure (in mmHg)
  5. chol: serum cholesterol level (in mg/dl)
  6. fbs: fasting blood sugar > 120 mg/dl (1 = true, 0 = false)
  7. restecg: resting electrocardiographic results, takes 3 values
    • 0: normal
    • 1: having ST-T wave abnormality
    • 2: probable left ventricular hypertrophy
  8. thalach: maximum attained heart rate
  9. exang: exercise-induced angina (1 = yes, 0 = no) (i.e. does exercise cause angina?)
  10. oldpeak: ST depression induced by exercise relative to rest
  11. slope: the slope of the peak exercise ST segment, takes 3 values
    • 1: upsloping
    • 2: flat
    • 3: downsloping
  12. ca: number of major vessels (0 - 3) colored by fluoroscopy
  13. thal: status of the blood disorder thalassemia, takes 3 values
    • 3: normal
    • 6: fixed defect
    • 7: reversible defect
  14. diagnosis: angiographic heart disease status, takes 2 values
    • 0: < 50% artery diameter narrowing
    • 1: > 50% artery diameter narrowing

2. Data Loading and Pre-processing

The dataset can be accessed by clicking on "Data Folder" at the top, and "processed.cleveland.data" should be listed. The data is comma-separated, so reading it in is very easy. While doing so, let's give each column a meaningful name so its attribute is easier to interpret.
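
As a rough sketch of that step (the filename assumes a local download from the Data Folder, and the column names are my own illustrative choices, reused in later snippets):

```python
# Column names matching the 14 attributes described above
columns = ["age", "sex", "chest_pain_type", "resting_bp", "cholesterol",
           "fasting_blood_sugar", "resting_ecg", "max_heart_rate",
           "exercise_angina", "st_depression", "st_slope",
           "num_major_vessels", "thalassemia", "diagnosis"]

# The file has no header row, so we supply the names ourselves
df = pd.read_csv("processed.cleveland.data", header=None, names=columns)
df.head()
```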

Next, let's see how many rows we are working with. 303! Not bad, but pre-processing in the next step may reduce this number.
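
Checking this is a one-liner:

```python
# Number of rows (patients) in the raw dataset
print(len(df))  # 303
```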

Alright, let's now check for any missing data. This step is crucial because bad data can mess with our analysis and needs to be addressed. Notice that every valid value in the dataset can be represented as a floating-point number, so any row containing a value that can't be parsed as one must be missing some data. Let's see which rows, if any, fall into that category.
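
Following that idea, one way to flag the problem rows is a sketch like this (continuing with the df from the loading step above):

```python
# Try converting every column to a float; anything that can't be parsed
# (such as placeholder characters) becomes NaN
numeric_df = df.apply(pd.to_numeric, errors="coerce")

# Rows containing at least one value that failed to convert
missing_rows = df[numeric_df.isna().any(axis=1)]
print(len(missing_rows))  # 6 rows with invalid data
```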

It turns out that 6 rows have invalid data. We have a few options to deal with this, but the simplest one is to just remove those 6 rows, since 6 rows is not a lot. An alternative approach is imputation, in which we replace the missing values with the median or mean of their respective columns. Both approaches are fine, with their own benefits and drawbacks. Deleting the rows can be bad because we are throwing away data that could be useful, as the more data we have, the better. Imputing could be bad because we could be introducing bias into our data, since we are merely estimating the missing values. But for our purposes, either should work fine, as only a small number of rows are affected.

Additionally, the description given to the diagnosis column says it should take a value of 0 or 1, but the dataset actually contains values between 0 and 4. Fortunately, most of the rows are 0 or 1, so to be safe, let's only keep the rows with a diagnosis of 0 or 1. We don't actually know what a diagnosis value of 2, 3, or 4 represents!
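
A minimal sketch of both steps, continuing from the numeric_df computed above:

```python
# Keep only rows where every value parsed as a number, and work with the
# numeric version of the dataframe from here on
df = numeric_df.dropna()

# Keep only the rows whose diagnosis is 0 or 1
df = df[df["diagnosis"].isin([0, 1])]
print(len(df))  # 214 rows remain
```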

This gives us a total of 214 rows after pre-processing.

3. Exploratory Data Analysis

After preprocessing and tidying up our dataset, we can move on to visualizing and discovering trends in our data, while using statistical approaches to reinforce those trends. This will also allow us to gain a better understanding of our data as a whole, which is important if we want to make meaningful conclusions from it!

To start off, let's make a correlation heatmap for our dataset to get a general picture of what to expect. This heatmap shows the Pearson (r) correlation coefficient between each pair of attributes in our dataset. The Pearson correlation coefficient takes a value between -1 and 1 and tells us the strength of the linear correlation between two variables. A value of exactly -1 or 1 means the two variables form an exact negative- or positive-sloped line, respectively. Realistically, this will never happen, but a value close to either endpoint means there is a strong linear correlation present.

Note: I also included a "mask" which removes the upper-right half of the heatmap, as it is redundant (it mirrors the lower-left half). Other heatmaps you encounter might not do this, and they might also use different color schemes, which can be confusing at first. The important thing is to look at the values the colors represent. If you'd like to learn more about heatmaps and how to create your own, I'd highly recommend seaborn's heatmap documentation.
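
A sketch of how such a masked heatmap can be produced with seaborn (the figure size and color map here are arbitrary choices of mine):

```python
# Pearson correlation between every pair of columns
corr = df.corr()

# Mask the upper triangle, which mirrors the lower triangle
mask = np.triu(np.ones_like(corr, dtype=bool))

plt.figure(figsize=(10, 8))
sns.heatmap(corr, mask=mask, annot=True, fmt=".2f", cmap="coolwarm",
            vmin=-1, vmax=1)
plt.title("Pearson correlation between attributes")
plt.show()
```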

Looking at the heatmap, we can infer a lot of information. We can see which attributes have a strong linear correlation; for instance, age and max heart rate do, since their Pearson coefficient is -0.45. ST segment depression and the slope of the peak ST segment are also linearly correlated, which is unsurprising given their definitions and how they relate.

To dive deeper and gain more insight, we should further expand on our observations. Focusing on age vs max heart rate, let's see what else we can find.

Let's try plotting the distribution of max heart rates for different age groups. First, we need to group the ages into bins, since age is numerical data. Grouping data is very useful because it allows us to make observations about a specific group and compare it to others. Looking at the unique ages, we can see that there is a range of 48 years, so 4 bins of 12 years each would work nicely. We can accomplish this with pd.groupby() to group our data and pd.cut() to separate the ages into the bins. More information about using groupby() and cut() can be found here. Let's make our bins and group our data by age to visualize the distribution of max heart rate for each age group.
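
One way to do this, sticking with the illustrative column names used earlier:

```python
# Split the ages into 4 equal-width bins (roughly 12 years each)
age_groups = pd.cut(df["age"], bins=4)

# One histogram of max heart rate per age group
fig, axes = plt.subplots(1, 4, figsize=(16, 4), sharey=True)
for ax, (group, subset) in zip(axes, df.groupby(age_groups)):
    ax.hist(subset["max_heart_rate"], bins=10)
    ax.set_title(f"Ages {group}")
    ax.set_xlabel("Max heart rate")
axes[0].set_ylabel("Count")
plt.show()
```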

Now, before plotting, we should be able to "predict" what the distributions should look like. Since the r coefficient is negative, max heart rate should decrease as the age groups increase. Let's see if this is the case!

As predicted, we can see from the distributions that the average max heart rate goes down as age increases, which is consistent with the r coefficient we found for the values.

Heart rate can be a telltale sign of cardiovascular health (though not always). An abnormally low heart rate can be a sign of heart disease or a heart attack, while a high one can be a sign of other cardiovascular problems, such as reduced pumping function of the heart. Visualizing this relationship between age and max heart rate firsthand helps us better understand heart health and what normal heart rates look like at different ages.

Apart from age, a person's sex is also relevant when it comes to heart disease. For one, numerous studies have shown that men are much more likely (around twice as likely!!) to have heart attacks and heart disease than women. Let's see if we can conclude something similar from our data.

Like with age, we will be using pd.groupby(), but with sex this time. This separates the men's and women's data into their own dataframes, so we can make comparisons between them. First, let's compare the proportion of women diagnosed with angiographic heart disease to that of men using a double bar plot.
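
A sketch of that comparison, assuming the dataset's usual coding of sex (0 = female, 1 = male); since diagnosis is 0 or 1, its mean within each group is the proportion diagnosed:

```python
# Separate subsets we will reuse in later snippets
women = df[df["sex"] == 0]
men = df[df["sex"] == 1]

# Proportion of each sex diagnosed with angiographic heart disease
proportions = df.groupby("sex")["diagnosis"].mean()
proportions.index = ["Women", "Men"]
proportions.plot(kind="bar", rot=0)
plt.ylabel("Proportion diagnosed with heart disease")
plt.show()
```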

As we can see, 11% of women in our dataset were diagnosed with angiographic heart disease, which is far less than the 33% of men. This means that, according to our dataset, men are three times more likely to be diagnosed with heart disease than women, which is higher than the roughly two-times figure from the research studies. This difference could be because we need more data to get an accurate estimate, since we only have 80 women and 134 men in our dataset. However, there is clearly a higher proportion of men with heart disease than women, as expected.

As we have already grouped by sex, let's take this opportunity to make further inferences from our data. Let's take a look at the distribution of cholesterol levels between men and women, as our correlation heatmap indicates somewhat of a linear correlation between sex and cholesterol. To do this, we can graph both histograms on the same plot, since there are only two of them. To visualize their overlap clearly, we can set the transparency level of the colors to 50% (alpha=0.5). Let's see our results!
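
Reusing the women and men subsets from the previous snippet, the overlapping histograms might be drawn like this:

```python
# Overlay the two histograms at 50% transparency so both stay visible
plt.hist(women["cholesterol"], bins=20, alpha=0.5, label="Women")
plt.hist(men["cholesterol"], bins=20, alpha=0.5, label="Men")
plt.xlabel("Serum cholesterol (mg/dl)")
plt.ylabel("Count")
plt.legend()
plt.show()
```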

Looking at the two distributions, it seems that women generally have a higher cholesterol level than men, though it's pretty close. It's also been shown that a high cholesterol level increases the chances of heart disease; however, women generally have higher cholesterol but a lower chance of heart disease compared to men, which seems contradictory. This is likely because women's cholesterol levels aren't that much higher than men's, as shown by our plot, but more importantly, cholesterol is not the only determining factor for the risk of heart disease!

Next, going back to age vs max heart rate, we saw that as age increased, max heart rate decreased. But perhaps the rate of decrease differs between men and women? Let's find out by plotting a scatterplot of age vs max heart rate for each sex and calculating a regression line for each using np.polyfit().
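
A sketch of the scatterplot and the two fitted lines, again reusing the women and men subsets:

```python
plt.figure(figsize=(8, 6))
for subset, label in [(women, "Women"), (men, "Men")]:
    x = subset["age"]
    y = subset["max_heart_rate"]
    plt.scatter(x, y, alpha=0.6, label=label)

    # Fit a degree-1 polynomial (a straight line) and plot it
    slope, intercept = np.polyfit(x, y, deg=1)
    xs = np.linspace(x.min(), x.max(), 100)
    plt.plot(xs, slope * xs + intercept,
             label=f"{label} fit: slope = {slope:.3f}")

plt.xlabel("Age")
plt.ylabel("Max heart rate")
plt.legend()
plt.show()
```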

Each regression line minimizes the squared error on its respective subset, and we can use that line to estimate max heart rate given an age. To better understand how linear regression works, check out this article. The slopes of the regression lines also give the rate at which max heart rate decreases for every year of age. In our case, the slope for men is a bit steeper than that for women (-1.138 vs -0.859), but not by much. Let's examine the lines further by calculating their r coefficient values.
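
The r values (and the r-squared values discussed below) can be computed directly with np.corrcoef, for example:

```python
# Pearson r between age and max heart rate for each group
for subset, label in [(women, "Women"), (men, "Men")]:
    r = np.corrcoef(subset["age"], subset["max_heart_rate"])[0, 1]
    print(f"{label}: r = {r:.3f}, r-squared = {r ** 2:.3f}")
```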

As expected, the two values are similar since the slopes of the lines are similar. The coefficients are both negative, which indicates a negative relationship between age and max heart rate (i.e. as age increases, max heart rate decreases).

Using the calculated r coefficient, we can also obtain the r-squared value by squaring it. This r-squared value represents the proportion of variance that can be explained by our linear model (y = mx + b, from np.polyfit()). In basically all cases, some variance will be left unexplained, because the data points won't fall exactly on the regression line. The more closely they do, though, the higher the r-squared value, meaning more of the variance in the data is captured by the model. In our case, the r-squared value is not that high, which means our model only explains part of the total variance.

4. Hypothesis Testing using T-test

Lastly, looking at the graph of our data, it seems that the average max heart rate is about the same for men and women. However, according to research, men's max heart rate tends to be higher than women's. To see if our data follows this trend as well, we can use a t-test! To perform one, let's use ttest_ind() from the scipy.stats library. Note that in order to do this, the two samples have to be independent. In our case they are, since the men's and women's samples are independent. To learn more about hypothesis testing and t-tests, I encourage reading this article.
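
A sketch of the test call (note that the alternative argument requires a reasonably recent version of SciPy, 1.6 or later):

```python
from scipy.stats import ttest_ind

# One-sided test: alternative="less" asks whether the first sample's mean
# (women) is less than the second sample's mean (men)
t_stat, p_value = ttest_ind(women["max_heart_rate"], men["max_heart_rate"],
                            alternative="less")
print(f"t = {t_stat:.3f}, p-value = {p_value:.3f}")
```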

We pass our two samples into ttest_ind() and use "less" as our alternative hypothesis, since we are testing whether the mean max heart rate for women is less than that of men. We get a p-value of ~0.29, which is greater than our chosen significance level of 0.05. Hence, there is not enough evidence to reject the null hypothesis, so we cannot conclude that the average max heart rate for women is lower than that of men. This is not exactly the result we were expecting, so let's print out the sample means to see what's going on.

So, it seems that the men in our dataset only had a slightly higher mean max heart rate. This slight difference, however, is not large enough to conclude that the men's mean is definitely higher than the women's, which is why our t-test failed to reject the null hypothesis.

5. Using Logistic Regression to Predict Angiographic Diagnosis

Finally, after visualizing our data and gaining a much better understanding of it, we can use some of the insights we gained to create a machine learning model that predicts the angiographic diagnosis of a new patient, given the other attributes. In our case, diagnosis is a classification problem with two classes, 0 and 1. For classification problems, a good algorithm to try is logistic regression, so let's give that a shot! Before we do, though, there's just one thing we should change about our data, which is applying one-hot encoding to some of our columns.

One-hot encoding aims to solve the issue of assigning categorical variables to different integers. The reason this is a problem is that many machine learning algorithms treat numbers as ordered magnitudes; in other words, 2 is bigger than 1 and therefore carries more weight. But for a categorical variable, say pets, 2 might mean "cat" and 1 might mean "dog", and that doesn't mean cats matter more than dogs in our model. So, if we leave the encoding like that without one-hot encoding, our machine learning model might incorrectly interpret cats as being more important than dogs.

More specifically, one-hot encoding transforms an integer-coded categorical variable into a boolean vector of size n, where n is the number of categories for that variable. To do this with a pandas dataframe, we call the pd.get_dummies() function, which creates n new columns, one for each categorical value, and drops the old column. In our dataset, notice that we have three categorical variables whose numerical values carry little meaning if left unchanged. For example, one of them is chest pain type, which takes an integer from 1 to 4. However, those integers don't relate to one another at all, so it's better to use one-hot encoding instead. This results in 4 new columns being created, since there are 4 different values for chest pain type, and the old chest pain type column being dropped. The same is done with the resting ECG and thalassemia variables, each with their own number of categories.
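
With the illustrative column names used earlier, the encoding step might look like this:

```python
# One-hot encode the three categorical columns; get_dummies creates one new
# column per category value and drops the original columns
df = pd.get_dummies(df, columns=["chest_pain_type", "resting_ecg", "thalassemia"])
```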

For transparency's sake, one-hot encoding can have some drawbacks. For one, since we are splitting a single variable into several columns, it can introduce multicollinearity, as the new columns are correlated with one another. Increasing the number of columns can also lead to longer training times on very large datasets with many features, but for our small dataset this won't be an issue.

Now, we can train our logistic regression model on our dataset. To test the accuracy of our model, we will use K-Folds cross validation, which trains and tests the model multiple times on different slices of the dataset so that every row is used for both training and testing. This gives a better representation of the accuracy of our model than a single training/testing split.
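
A sketch of the training and evaluation step; the fold count (5 here), shuffling, and the max_iter setting are my own choices, so exact numbers may differ from the original run:

```python
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score

X = df.drop(columns=["diagnosis"])
y = df["diagnosis"]

# K-Folds cross validation: each fold takes a turn as the test set while
# the remaining folds are used for training
kf = KFold(n_splits=5, shuffle=True, random_state=0)
model = LogisticRegression(max_iter=1000)
scores = cross_val_score(model, X, y, cv=kf)

print("Fold accuracies:", scores)
print("Mean accuracy:", scores.mean())
```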

Overall, we got an accuracy of around 80%, which is pretty good. We can also use our model to predict the diagnosis of a new patient given their attributes, as shown below. Obviously this can't be used for medical purposes, but it's interesting to be able to do so with our dataset. For further work, one could train a similar model on a different dataset with different attributes, or perhaps one from a different region (other than Cleveland). Also, the model I used for classification was logistic regression, but there are many other classification models I could've used, like SVMs and decision trees. For a list of all possible models (including classification models), check out sklearn's wonderful documentation.
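
As a sketch of what such a prediction looks like: the "new patient" below is just an existing row of X reused as a stand-in, since in practice you would build a one-row dataframe with the same one-hot encoded columns.

```python
# Refit the model on the full dataset, then predict a diagnosis for a
# stand-in "new patient" (here simply the first row of X)
model.fit(X, y)

new_patient = X.iloc[[0]]            # one-row DataFrame with the same columns as X
print(model.predict(new_patient))    # predicted class: 0 or 1
```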

And just a refresher, the predicted class corresponds to the angiographic disease status diagnosis: a value of 0 means less than 50% artery diameter narrowing, while a value of 1 means more than 50% narrowing.

I've also included a link to more information about what this means near the beginning of the tutorial, where I went over the different attributes of the dataset.

Summary

As mentioned, heart disease is an important topic which requires lots of research and investigation. This tutorial aims to provide a solid understanding of the trends and factors related to heart disease, such as age, sex, and cholesterol levels, using data science techniques and statistical analysis (Pearson coefficients, t-tests) to support those findings. To close it off, we trained a logistic regression classifier to predict the diagnosis of angiographic heart disease for new patients. I hope that readers of this tutorial gained a better understanding of heart disease and data science, and are able to extend this knowledge to new datasets and different classification models, with an understanding of how and why it works.