Email Spam Classifier.ipynb

Collins Akala

Data Analyst
Jupyter Notebook
Python
In [2]:
# Load the labelled messages into a DataFrame. The path is relative, so it is
# resolved against the kernel's current working directory.
df = pd.read_csv('mail_data.csv')
In [3]:
df.head(5)
Out[3]:
Category Message 0 ham Go until jurong point, crazy.. Available only ... 1 ham Ok lar... Joking wif u oni... 2 spam Free entry in 2 a wkly comp to win FA Cup fina... 3 ham U dun say so early hor... U c already then say... 4 ham Nah I don't think he goes to usf, he lives aro...
In [4]:
# Keep every cell that holds a value and substitute an empty string for the
# missing ones, so later text processing never sees NaN.
data = df.where(df.notna(), '')
In [5]:
data.head()
Out[5]:
Category Message 0 ham Go until jurong point, crazy.. Available only ... 1 ham Ok lar... Joking wif u oni... 2 spam Free entry in 2 a wkly comp to win FA Cup fina... 3 ham U dun say so early hor... U c already then say... 4 ham Nah I don't think he goes to usf, he lives aro...
In [6]:
data.info() <class 'pandas.core.frame.DataFrame'> RangeIndex: 5572 entries, 0 to 5571 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Category 5572 non-null object 1 Message 5572 non-null object dtypes: object(2) memory usage: 87.2+ KB
In [7]:
data.shape
Out[7]:
(5572, 2)
In [8]:
# Encode the text labels in place as numbers: spam -> 0, ham -> 1.
# The mapping is applied in this order, one label at a time, via .loc so the
# assignment targets `data` itself rather than a temporary copy.
for text_label, numeric_label in {'spam': 0, 'ham': 1}.items():
    data.loc[data['Category'] == text_label, 'Category'] = numeric_label
In [9]:
# Split the frame into the model input (message text) and the target (label).
X, Y = data['Message'], data['Category']
In [10]:
X.head()
Out[10]:
0 Go until jurong point, crazy.. Available only ... 1 Ok lar... Joking wif u oni... 2 Free entry in 2 a wkly comp to win FA Cup fina... 3 U dun say so early hor... U c already then say... 4 Nah I don't think he goes to usf, he lives aro... Name: Message, dtype: object
In [11]:
Y.head()
Out[11]:
0 1 1 1 2 0 3 1 4 1 Name: Category, dtype: object
In [12]:
# Hold out 20% of the messages for evaluation. random_state is pinned so the
# same rows land in each split on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)
In [13]:
print(X.shape) print(X_train.shape) print(X_test.shape) (5572,) (4457,) (1115,)
In [14]:
print(Y.shape) print(Y_train.shape) print(Y_test.shape) (5572,) (4457,) (1115,)
In [15]:
# Turn the raw message text into TF-IDF feature vectors.
# `lowercase` must be a real boolean, not the string 'True': a non-empty string
# is merely truthy, and scikit-learn releases that validate parameter types
# reject it outright.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

# Learn the vocabulary and IDF weights from the training split only, then apply
# that same fitted transformer to the test split.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# The labels are stored with dtype object after the in-place encoding; cast
# them to integers before they are handed to an estimator.
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')
In [16]:
print(X_train) (0, 5413) 0.6198254967574347 (0, 4456) 0.4168658090846482 (0, 2224) 0.413103377943378 (0, 3811) 0.34780165336891333 (0, 2329) 0.38783870336935383 (1, 4080) 0.18880584110891163 (1, 3185) 0.29694482957694585 (1, 3325) 0.31610586766078863 (1, 2957) 0.3398297002864083 (1, 2746) 0.3398297002864083 (1, 918) 0.22871581159877646 (1, 1839) 0.2784903590561455 (1, 2758) 0.3226407885943799 (1, 2956) 0.33036995955537024 (1, 1991) 0.33036995955537024 (1, 3046) 0.2503712792613518 (1, 3811) 0.17419952275504033 (2, 407) 0.509272536051008 (2, 3156) 0.4107239318312698 (2, 2404) 0.45287711070606745 (2, 6601) 0.6056811524587518 (3, 2870) 0.5864269879324768 (3, 7414) 0.8100020912469564 (4, 50) 0.23633754072626942 (4, 5497) 0.15743785051118356 : : (4454, 4602) 0.2669765732445391 (4454, 3142) 0.32014451677763156 (4455, 2247) 0.37052851863170466 (4455, 2469) 0.35441545511837946 (4455, 5646) 0.33545678464631296 (4455, 6810) 0.29731757715898277 (4455, 6091) 0.23103841516927642 (4455, 7113) 0.30536590342067704 (4455, 3872) 0.3108911491788658 (4455, 4715) 0.30714144758811196 (4455, 6916) 0.19636985317119715 (4455, 3922) 0.31287563163368587 (4455, 4456) 0.24920025316220423 (4456, 141) 0.292943737785358 (4456, 647) 0.30133182431707617 (4456, 6311) 0.30133182431707617 (4456, 5569) 0.4619395404299172 (4456, 6028) 0.21034888000987115 (4456, 7154) 0.24083218452280053 (4456, 7150) 0.3677554681447669 (4456, 6249) 0.17573831794959716 (4456, 6307) 0.2752760476857975 (4456, 334) 0.2220077711654938 (4456, 5778) 0.16243064490100795 (4456, 2870) 0.31523196273113385
In [24]:
# Classify one hand-written message.
# NOTE(review): `model` is not defined in the cells visible here; it is assumed
# to be the classifier fitted in an earlier cell. Confirm before Run All.
input_your_mail = [
    "This is the second time we have tried to contact u. u have won A$400 price. 2 claim is easy, just call 087104711148 NOW! Only 10p per minute. BT.national.rate"
]

# Vectorize with the already-fitted transformer (transform only, no refit).
input_data_features = feature_extraction.transform(input_your_mail)

prediction = model.predict(input_data_features)
print(prediction)

# Label 1 is reported as ham; anything else is reported as spam.
print('Ham mail' if prediction[0] == 1 else 'Spam mail')
In [25]:
# Classify a second hand-written message, this time an ordinary chat line.
# NOTE(review): `model` is not defined in the cells visible here; it is assumed
# to be the classifier fitted in an earlier cell. Confirm before Run All.
input_your_mail = [
    "Well obviously not because all the people in my cool college life went habsome"
]

# Vectorize with the already-fitted transformer (transform only, no refit).
input_data_features = feature_extraction.transform(input_your_mail)

prediction = model.predict(input_data_features)
print(prediction)

# Label 1 is reported as ham; anything else is reported as spam.
print('Ham mail' if prediction[0] == 1 else 'Spam mail')
Partner With Collins
View Services

More Projects by Collins