Email Spam Classifier.ipynb

Collins Akala

Data Analyst

Jupyter Notebook

Python

In [2]:

# Load the raw mail dataset from a CSV file in the working directory.
df = pd.read_csv('mail_data.csv')

In [3]:

# Preview the first five rows of the freshly loaded frame.
df.head(5)

Out[3]:

Category Message 0 ham Go until jurong point, crazy.. Available only ... 1 ham Ok lar... Joking wif u oni... 2 spam Free entry in 2 a wkly comp to win FA Cup fina... 3 ham U dun say so early hor... U c already then say... 4 ham Nah I don't think he goes to usf, he lives aro...

In [4]:

# Keep every non-missing cell as-is and put an empty string wherever a value
# is missing, so later text processing never receives NaN.
data = df.where(df.notna(), other='')

In [5]:

# Preview the frame after missing values were replaced with empty strings.
data.head()

Out[5]:

In [6]:

# Summarise the columns: non-null counts and dtypes (captured output follows the call).
data.info() <class 'pandas.core.frame.DataFrame'> RangeIndex: 5572 entries, 0 to 5571 Data columns (total 2 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Category 5572 non-null object 1 Message 5572 non-null object dtypes: object(2) memory usage: 87.2+ KB

In [7]:

# (rows, columns) of the cleaned frame.
data.shape

Out[7]:

(5572, 2)

In [8]:

# Encode the text labels as numbers in place: spam -> 0, ham -> 1.
# The pairs are applied in this order, one label at a time.
for label, code in (('spam', 0), ('ham', 1)):
    data.loc[data['Category'] == label, 'Category'] = code

In [9]:

# Split the frame into the model input (message text) and the target (encoded label).
X, Y = data['Message'], data['Category']

In [10]:

# Peek at the first few message texts that will be used as model input.
X.head()

Out[10]:

0 Go until jurong point, crazy.. Available only ... 1 Ok lar... Joking wif u oni... 2 Free entry in 2 a wkly comp to win FA Cup fina... 3 U dun say so early hor... U c already then say... 4 Nah I don't think he goes to usf, he lives aro... Name: Message, dtype: object

In [11]:

# Peek at the first few encoded labels (0 = spam, 1 = ham).
Y.head()

Out[11]:

0 1 1 1 2 0 3 1 4 1 Name: Category, dtype: object

In [12]:

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [13]:

# Sanity-check the split sizes for the messages: full set, training part, test part.
print(X.shape) print(X_train.shape) print(X_test.shape) (5572,) (4457,) (1115,)

In [14]:

# Sanity-check the split sizes for the labels: full set, training part, test part.
print(Y.shape) print(Y_train.shape) print(Y_test.shape) (5572,) (4457,) (1115,)

In [15]:

# Turn the raw message text into TF-IDF feature vectors.
# `lowercase` must be the boolean True: the string 'True' only worked by
# being truthy, and scikit-learn releases that validate estimator parameters
# reject it with InvalidParameterError when fit_transform is called.
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)

# Learn the vocabulary and IDF weights from the training messages only, then
# apply that same fitted mapping to the test messages.
X_train_features = feature_extraction.fit_transform(X_train)
X_test_features = feature_extraction.transform(X_test)

# The label column is stored with dtype object, so cast it to integers
# before handing it to an estimator.
Y_train = Y_train.astype('int')
Y_test = Y_test.astype('int')

In [16]:

print(X_train) (0, 5413) 0.6198254967574347 (0, 4456) 0.4168658090846482 (0, 2224) 0.413103377943378 (0, 3811) 0.34780165336891333 (0, 2329) 0.38783870336935383 (1, 4080) 0.18880584110891163 (1, 3185) 0.29694482957694585 (1, 3325) 0.31610586766078863 (1, 2957) 0.3398297002864083 (1, 2746) 0.3398297002864083 (1, 918) 0.22871581159877646 (1, 1839) 0.2784903590561455 (1, 2758) 0.3226407885943799 (1, 2956) 0.33036995955537024 (1, 1991) 0.33036995955537024 (1, 3046) 0.2503712792613518 (1, 3811) 0.17419952275504033 (2, 407) 0.509272536051008 (2, 3156) 0.4107239318312698 (2, 2404) 0.45287711070606745 (2, 6601) 0.6056811524587518 (3, 2870) 0.5864269879324768 (3, 7414) 0.8100020912469564 (4, 50) 0.23633754072626942 (4, 5497) 0.15743785051118356 : : (4454, 4602) 0.2669765732445391 (4454, 3142) 0.32014451677763156 (4455, 2247) 0.37052851863170466 (4455, 2469) 0.35441545511837946 (4455, 5646) 0.33545678464631296 (4455, 6810) 0.29731757715898277 (4455, 6091) 0.23103841516927642 (4455, 7113) 0.30536590342067704 (4455, 3872) 0.3108911491788658 (4455, 4715) 0.30714144758811196 (4455, 6916) 0.19636985317119715 (4455, 3922) 0.31287563163368587 (4455, 4456) 0.24920025316220423 (4456, 141) 0.292943737785358 (4456, 647) 0.30133182431707617 (4456, 6311) 0.30133182431707617 (4456, 5569) 0.4619395404299172 (4456, 6028) 0.21034888000987115 (4456, 7154) 0.24083218452280053 (4456, 7150) 0.3677554681447669 (4456, 6249) 0.17573831794959716 (4456, 6307) 0.2752760476857975 (4456, 334) 0.2220077711654938 (4456, 5778) 0.16243064490100795 (4456, 2870) 0.31523196273113385

In [24]:

# Classify one hand-written message with the already fitted vectorizer and model.
input_your_mail = ["This is the second time we have tried to contact u. u have won A$400 price. 2 claim is easy, just call 087104711148 NOW! Only 10p per minute. BT.national.rate"]

# transform (not fit_transform): reuse the vocabulary of the fitted vectorizer.
input_data_features = feature_extraction.transform(input_your_mail)
prediction = model.predict(input_data_features)
print(prediction)

# Label 1 is reported as ham; any other value is reported as spam.
print('Ham mail' if prediction[0] == 1 else 'Spam mail')

# Alternative sample message to try:
# Well obviously not because all the people in my cool college life went habsome

In [25]:

# Classify a second hand-written message with the already fitted vectorizer and model.
input_your_mail = ["Well obviously not because all the people in my cool college life went habsome"]

# transform (not fit_transform): reuse the vocabulary of the fitted vectorizer.
input_data_features = feature_extraction.transform(input_your_mail)
prediction = model.predict(input_data_features)
print(prediction)

# Label 1 is reported as ham; any other value is reported as spam.
print('Ham mail' if prediction[0] == 1 else 'Spam mail')

Like this project

Posted Nov 2, 2023

Email spam classifier. Contribute to PopeCollins/Email-Spam-CLassifier development by creating an account on GitHub.

Likes

Views

Email Spam Classifier.ipynb

Join 50k+ companies and 1M+ independents