# Use this cell to set up import statements for all of the packages that you
# plan to use
import pandas as pd
import numpy as py
import matplotlib.pyplot as plt
import seaborn as snb
# Remember to include a 'magic word' so that your visualizations are plotted
# inline with the notebook. See this page for more:
# http://ipython.readthedocs.io/en/stable/interactive/magics.html
%matplotlib inline
Upgrade pandas so that the `DataFrame.explode()` function is available.
!pip install --upgrade pandas==0.25.0
Data Wrangling
General Properties
In [2]:
# Read the raw appointment records from the CSV export and preview the first
# rows to eyeball column names, data types and any obviously errant values.
import pandas as pd
csv_path = 'noshowappointments-kagglev2-may-2016.csv'
df = pd.read_csv(csv_path)
df.head()
Out[2]:
PatientId AppointmentID Gender ScheduledDay AppointmentDay Age Neighbourhood Scholarship Hipertension Diabetes Alcoholism Handcap SMS_received No-show 0 2.987250e+13 5642903 F 2016-04-29T18:38:08Z 2016-04-29T00:00:00Z 62 JARDIM DA PENHA 0 1 0 0 0 0 No 1 5.589978e+14 5642503 M 2016-04-29T16:08:27Z 2016-04-29T00:00:00Z 56 JARDIM DA PENHA 0 0 0 0 0 0 No 2 4.262962e+12 5642549 F 2016-04-29T16:19:04Z 2016-04-29T00:00:00Z 62 MATA DA PRAIA 0 0 0 0 0 0 No 3 8.679512e+11 5642828 F 2016-04-29T17:29:31Z 2016-04-29T00:00:00Z 8 PONTAL DE CAMBURI 0 0 0 0 0 0 No 4 8.841186e+12 5642494 F 2016-04-29T16:07:23Z 2016-04-29T00:00:00Z 56 JARDIM DA PENHA 0 1 1 0 0 0 No
In [3]:
# Inspect the dimensions of the data as (rows, columns)
df.shape
Out[3]:
(110527, 14)
In [4]:
# Count rows that are exact duplicates of an earlier row
df.duplicated().sum()
Out[4]:
0
In [5]:
# Check whether several rows share the same patient id: counts every row
# whose PatientId already appeared in an earlier row
df['PatientId'].duplicated().sum()
Out[5]:
48228
In [6]:
# Count rows that repeat an earlier (PatientId, No-show) combination, to see
# whether the duplicated patient ids differ in their no-show status
df.duplicated(['PatientId','No-show']).sum()
PatientId AppointmentID Gender ScheduledDay AppointmentDay Age Neighbourhood Scholarship Hypertension Diabetes Alcoholism Handcap SMS_received No_show 0 2.987250e+13 5642903 F 2016-04-29T18:38:08Z 2016-04-29T00:00:00Z 62 JARDIM DA PENHA 0 1 0 0 0 0 No 1 5.589978e+14 5642503 M 2016-04-29T16:08:27Z 2016-04-29T00:00:00Z 56 JARDIM DA PENHA 0 0 0 0 0 0 No 2 4.262962e+12 5642549 F 2016-04-29T16:19:04Z 2016-04-29T00:00:00Z 62 MATA DA PRAIA 0 0 0 0 0 0 No 3 8.679512e+11 5642828 F 2016-04-29T17:29:31Z 2016-04-29T00:00:00Z 8 PONTAL DE CAMBURI 0 0 0 0 0 0 No 4 8.841186e+12 5642494 F 2016-04-29T16:07:23Z 2016-04-29T00:00:00Z 56 JARDIM DA PENHA 0 1 1 0 0 0 No
In [13]:
# Drop rows that repeat an earlier (PatientId, No_show) pair, keeping the first.
# A patient id may still appear more than once when the rows differ in show
# status, since the same patient may have several appointments.
df.drop_duplicates(subset=['PatientId', 'No_show'], keep='first', inplace=True)
df.shape
Out[13]:
(71816, 14)
In [14]:
# Discard the identifier and timestamp columns that are not needed below
unused_columns = ['PatientId', 'AppointmentID', 'ScheduledDay', 'AppointmentDay']
df.drop(columns=unused_columns, inplace=True)
df.head()
Out[14]:
Gender Age Neighbourhood Scholarship Hypertension Diabetes Alcoholism Handcap SMS_received No_show 0 F 62 JARDIM DA PENHA 0 1 0 0 0 0 No 1 M 56 JARDIM DA PENHA 0 0 0 0 0 0 No 2 F 62 MATA DA PRAIA 0 0 0 0 0 0 No 3 F 8 PONTAL DE CAMBURI 0 0 0 0 0 0 No 4 F 56 JARDIM DA PENHA 0 1 1 0 0 0 No
Research Question 1 (explore the data, gather more information, and take a general look)
In [16]:
# Histograms of the numeric columns for a first overview of their distributions
df.hist(figsize=(16,6.5));
In [17]:
# Boolean masks splitting the patients into those who showed up
# (No_show == 'No') and those who did not (No_show == 'Yes')
show = df['No_show'].eq('No')
noshow = df['No_show'].eq('Yes')
# Non-null counts per column for each of the two groups
df[show].count(), df[noshow].count()
# What is the percentage of male and female patients in each attendance group?
def attendance(df, col_name, attended, absent):
    """Draw one pie chart per attendance group showing the shares of `col_name`.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the column `col_name`.
    col_name : str
        Column whose category proportions are plotted (e.g. 'Gender').
    attended : boolean mask
        Selects the rows of patients who showed up.
    absent : boolean mask
        Selects the rows of patients who did not show up.

    Returns
    -------
    None
        The function only draws a matplotlib figure.
    """
    fig, axes = plt.subplots(1, 2, figsize=[12, 4])
    # Use the masks supplied by the caller (not a notebook-level variable) so
    # that both groups are drawn and the function works for any pair of masks.
    for ax, label, mask in zip(axes, ('show', 'noshow'), (attended, absent)):
        shares = df.loc[mask, col_name].value_counts(normalize=True)
        shares.plot(kind='pie', ax=ax, autopct='%1.1f%%')
        ax.set_title(label)
        ax.set_xlabel(col_name)
        # The slices are proportions, not patient counts, so no y label.
        ax.set_ylabel('')
    fig.suptitle('comparison acc.to ' + col_name.lower())


attendance(df, 'Gender', show, noshow)
In [23]:
# Does the mean age differ between the attendance groups for each gender?
# Both groups are collected in one table so they are drawn as side-by-side
# bars; two separate bar plots on the same axes would paint one series over
# the other.
mean_age_by_gender = pd.DataFrame({
    'show': df[show].groupby('Gender').Age.mean(),
    'noshow': df[noshow].groupby('Gender').Age.mean(),
})
ax = mean_age_by_gender.plot(kind='bar', color=['blue', 'red'], figsize=[12, 4])
ax.legend()
# The title names only what is actually plotted (age and gender).
ax.set_title('comparison acc.to age and gender')
ax.set_xlabel('GENDER')
ax.set_ylabel('Mean age');
In [24]:
# Mean and median age per gender, first for patients who showed up and then
# for those who did not (printed in the order: means, then medians)
age_show = df[show].groupby(['Gender']).Age
age_noshow = df[noshow].groupby('Gender').Age
print(age_show.mean(), age_noshow.mean(),
      age_show.median(), age_noshow.median())
Gender
F 39.130292
M 33.766269
Name: Age, dtype: float64 Gender
F 36.06501
M 31.22040
Name: Age, dtype: float64 Gender
F 39
M 32
Name: Age, dtype: int64 Gender
F 34
M 29
Name: Age, dtype: int64
There is no clear correlation between age, gender and the show rate: the mean and the median ages are almost the same for patients who showed up and those who did not.
The number of patients who showed up without receiving an SMS is greater than the number who showed up after receiving an SMS.
In [26]:
# Continue to explore the data to address your additional research
# questions. Add more headers as needed if you have more questions to
# investigate.
Patient attendance from a specific neighbourhood differs according to age; AEROPORTO has the highest age among attending patients.
Conclusions
Neighbourhood has a great effect on attendance: JARDIM CAMBURI has the greatest number of patients and also the greatest showing rate.
The number of showing patients from a specific neighbourhood is affected by receiving an SMS and by age.
The number of patients who showed up without receiving an SMS is greater than the number who showed up after receiving an SMS.
Limitations
No clear correlation was found between showing up and gender or chronic disease.
Submitting your Project
Like this project
0
Posted Aug 18, 2023
Dataset Description we have a csv file contain data which we downloaded(no shoes appointment) - InvestigateaDataset/Investigate_a_Dataset.ipynb at main · Ossm…