1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import json, re
from collections import Counter
from math import log, inf
from functools import cache
from typing import List
"""After training, model_dict is a global variable which is accessible inside this module"""
@cache
def tokenize(text):
    """Split ``text`` into lowercase alphanumeric tokens.

    The text is lowercased and split on whitespace; every character outside
    ``a-z`` / ``0-9`` is then removed from each piece, and pieces that end up
    empty are dropped.

    Args:
        text: The string to tokenize. It must be hashable because results
            are memoized with ``functools.cache``.

    Returns:
        A list of non-empty tokens in their original order. The list object
        is shared with the cache, so callers must not mutate it in place.
    """
    # split() with no argument breaks on any run of whitespace, so words
    # separated by a newline or a tab are not glued together once the
    # non-alphanumeric characters have been stripped.
    stripped = (re.sub(r'[^a-z0-9]', '', piece) for piece in text.lower().split())
    return [token for token in stripped if token]
def train(dataset):
    """Build the document-count tables for the classifier from ``dataset``.

    Args:
        dataset: An iterable of mappings. Each mapping must provide a
            ``'classification'`` entry (a hashable label) and a ``'contents'``
            entry (text that is passed to ``tokenize``). Any other entries
            are ignored.

    Returns:
        A dict with three entries:

        * ``'count_of_word_by_outcome'``: dict mapping each classification to
          a ``Counter`` of word -> number of documents with that
          classification that contain the word.
        * ``'num_data_points'``: total number of documents seen.
        * ``'count_of_data_points_with_outcome'``: ``Counter`` mapping each
          classification to its number of documents.

    Raises:
        KeyError: If a data point lacks ``'classification'`` or ``'contents'``.
    """
    count_of_word_by_outcome = {}
    count_of_data_points_with_outcome = Counter()

    for point in dataset:
        classification = point['classification']
        count_of_data_points_with_outcome[classification] += 1

        # A set makes every word count at most once per document, so the
        # table holds document frequencies rather than raw term frequencies.
        words_in_document = set(tokenize(point['contents']))
        word_counts = count_of_word_by_outcome.setdefault(classification, Counter())
        word_counts.update(words_in_document)

    return {
        'count_of_word_by_outcome': count_of_word_by_outcome,
        # Summing the per-outcome tallies gives the document total without
        # needing len(), so any iterable of data points is accepted.
        'num_data_points': sum(count_of_data_points_with_outcome.values()),
        'count_of_data_points_with_outcome': count_of_data_points_with_outcome,
    }
"""
TODO - Implement the following functions.
In each of these functions, you can use the MODEL variable which is a dictionary which has three members:
MODEL['count_of_word_by_outcome'][outcome][word] = Total number of documents in the category 'outcome' in which this word appears
MODEL['num_data_points'] = Total number of documents in the data set
MODEL['count_of_data_points_with_outcome'][outcome] = Total number of documents in the category 'outcome'
"""
@cache
def pr_outcome(outcome: str) -> float:  # Pr(outcome)
    """Return the probability Pr(outcome) of a classification.

    TODO: not implemented yet. This placeholder returns 0 for every outcome.

    Results are memoized per ``outcome`` by ``functools.cache``.
    NOTE(review): once this reads the trained model, the cache will
    presumably need clearing whenever the model is retrained -- confirm.

    Args:
        outcome: The classification label.

    Returns:
        Currently always 0.
    """
    return 0
@cache
def pr_word_given_outcome(word: str, outcome: str, num_words_in_document: int) -> float:  # Pr(word | outcome)
    """Return the conditional probability Pr(word | outcome).

    TODO: not implemented yet. This placeholder returns 0 for every input.

    Results are memoized per argument tuple by ``functools.cache``.

    Args:
        word: The token whose probability is requested.
        outcome: The classification label being conditioned on.
        num_words_in_document: Currently unused.
            NOTE(review): presumably intended for smoothing or normalisation
            -- confirm against the assignment specification.

    Returns:
        Currently always 0.
    """
    return 0
def pr_outcome_given_words(words: List[str], outcome: str) -> float:  # Pr(outcome | words)
    """Return the score Pr(outcome | words) for a tokenized document.

    TODO: not implemented yet. This placeholder returns 0 for every input,
    so every outcome currently receives the same score.

    Args:
        words: The document's tokens.
        outcome: The classification label to score.

    Returns:
        Currently always 0.
    """
    return 0
def predict(outcomes, words):
    """Return the outcome with the highest ``pr_outcome_given_words`` score.

    Args:
        outcomes: Iterable of candidate classification labels.
        words: The document's tokens, passed unchanged to
            ``pr_outcome_given_words`` for every candidate.

    Returns:
        The best-scoring outcome. When several outcomes tie, the one that
        appears first in ``outcomes`` wins. Returns ``None`` if ``outcomes``
        is empty or no score is greater than ``-inf``.
    """
    best = None
    best_likelihood = -inf
    for outcome in outcomes:
        likelihood = pr_outcome_given_words(words, outcome)
        # Strict '>' keeps the earliest candidate when scores are equal.
        if likelihood > best_likelihood:
            best = outcome
            best_likelihood = likelihood
    return best