"""Naive-Bayes style text classification helpers.

``train`` builds and returns a model dictionary.  The probability functions
further down are exercise placeholders that are expected to read the trained
model through a module-level ``MODEL`` variable.
NOTE(review): nothing in this section assigns ``MODEL``; presumably the
caller stores the result of ``train`` there -- confirm against the driver.
"""
import json
import re
from collections import Counter
from math import log, inf
from functools import cache
from typing import List

# Anything that is not a lowercase ASCII letter or a digit is stripped from
# each token.  Compiled once so the pattern lookup is not repeated per token.
_NON_ALNUM = re.compile(r'[^a-z0-9]')


@cache
def _tokenize_cached(text):
    """Return the tokens of *text* as an immutable tuple (memoised per text)."""
    # NOTE: only the space character separates tokens, so words separated by
    # a tab or newline end up fused into a single token.
    stripped = (_NON_ALNUM.sub('', piece) for piece in text.lower().split(" "))
    return tuple(token for token in stripped if token)


def tokenize(text):
    """Split *text* into lowercase alphanumeric tokens.

    The text is lowercased and split on single spaces; every character other
    than ``a-z`` / ``0-9`` is removed from each piece, and pieces that become
    empty are dropped.

    A fresh list is returned on every call.  The memoised result is kept as a
    tuple internally, so a caller mutating its list cannot corrupt the cache.
    """
    return list(_tokenize_cached(text))


def train(dataset):
    """Build the count tables for the classifier from *dataset*.

    Args:
        dataset: a sized iterable of mappings, each providing
            ``'classification'`` (the outcome label) and ``'contents'``
            (the document text).  Other keys are ignored.

    Returns:
        A dict with three entries:

        * ``'count_of_word_by_outcome'`` -- ``{outcome: Counter}`` where
          ``Counter[word]`` is the number of documents with that outcome
          that contain ``word`` (a word counts once per document).
        * ``'num_data_points'`` -- ``len(dataset)``.
        * ``'count_of_data_points_with_outcome'`` -- ``Counter`` mapping each
          outcome to the number of documents labelled with it.
    """
    num_data_points = len(dataset)
    count_of_word_by_outcome = {}
    count_of_data_points_with_outcome = Counter()

    for point in dataset:
        classification = point['classification']
        count_of_data_points_with_outcome[classification] += 1
        word_counts = count_of_word_by_outcome.setdefault(
            classification, Counter()
        )
        # set() so that each word is counted at most once per document.
        word_counts.update(set(tokenize(point['contents'])))

    return {
        'count_of_word_by_outcome': count_of_word_by_outcome,
        'num_data_points': num_data_points,
        'count_of_data_points_with_outcome': count_of_data_points_with_outcome,
    }


# TODO - Implement the following functions.
#
# In each of these functions, you can use the MODEL variable, which is a
# dictionary with three members:
#
#   MODEL['count_of_word_by_outcome'][outcome][word]
#       = number of documents with that outcome which contain the word
#   MODEL['num_data_points']
#       = number of documents in the dataset
#   MODEL['count_of_data_points_with_outcome'][outcome]
#       = number of documents in the dataset whose correct outcome is outcome
#
# NOTE(review): pr_outcome and pr_word_given_outcome are memoised with
# @cache.  Once they read MODEL, their cached values will not reflect a
# retrained model unless the caches are cleared (e.g. pr_outcome.cache_clear()).


@cache
def pr_outcome(outcome: str):
    """Pr(outcome).  Placeholder: not implemented yet, always returns 0."""
    return 0


@cache
def pr_word_given_outcome(word: str, outcome: str, num_words_in_document: int):
    """Pr(word | outcome).  Placeholder: not implemented yet, always returns 0."""
    return 0


def pr_outcome_given_words(words: List[str], outcome: str):
    """Pr(outcome | words).  Placeholder: not implemented yet, always returns 0."""
    return 0


def predict(outcomes, words):
    """Return the outcome with the highest ``pr_outcome_given_words`` score.

    Ties are resolved in favour of the outcome that appears first in
    *outcomes*, because a later candidate must score strictly higher to
    replace the current best.  Returns ``None`` when *outcomes* is empty or
    no score is greater than ``-inf``.
    """
    best = None
    best_likelihood = -inf
    for outcome in outcomes:
        pr = pr_outcome_given_words(words, outcome)
        if pr > best_likelihood:
            best = outcome
            best_likelihood = pr
    return best