# bayes.py 2.12 KB
import json, re
from collections import Counter
from math import log, inf
from functools import cache
from typing import List

"""After training, model_dict is a global variable which is accessible inside this module"""

# Compiled once at import time; matches every character that is not a
# lowercase ASCII letter or digit.
_NON_ALNUM = re.compile(r'[^a-z0-9]')


@cache
def _tokenize_cached(text):
    """Return the tokens of ``text`` as an immutable tuple (memoized)."""
    # split() with no argument breaks on any run of whitespace, so words
    # separated by newlines or tabs are not fused into one token.
    stripped = (_NON_ALNUM.sub('', chunk) for chunk in text.lower().split())
    # Chunks made only of punctuation collapse to '' and are dropped.
    return tuple(token for token in stripped if token)


def tokenize(text):
    """Split ``text`` into lowercase alphanumeric tokens.

    The text is lowercased, split on whitespace, and every character other
    than ``a-z`` / ``0-9`` is removed from each piece; pieces that end up
    empty are discarded.

    Args:
        text: The string to tokenize. It must be hashable because results
            are memoized per distinct input.

    Returns:
        A new list of tokens in order of appearance. A fresh list is built
        on every call so callers may mutate it without affecting the
        memoized result.
    """
    return list(_tokenize_cached(text))

def train(dataset):
    """Collect the count tables used by the probability functions.

    Args:
        dataset: An iterable of mappings. Only the ``'classification'`` and
            ``'contents'`` keys of each data point are read. Any iterable is
            accepted (it is traversed exactly once).

    Returns:
        A dict with three entries:
            ``'count_of_word_by_outcome'``: dict mapping each outcome to a
                Counter of how many data points with that outcome contain
                each word.
            ``'num_data_points'``: total number of data points seen.
            ``'count_of_data_points_with_outcome'``: Counter mapping each
                outcome to the number of data points labelled with it.

    Raises:
        KeyError: If a data point lacks ``'classification'`` or
            ``'contents'``.
    """
    count_of_word_by_outcome = {}
    count_of_data_points_with_outcome = Counter()
    num_data_points = 0

    for point in dataset:
        classification = point['classification']
        # Counted here rather than via len() so generators work as well.
        num_data_points += 1
        count_of_data_points_with_outcome[classification] += 1

        # set(): a word contributes at most once per data point, however
        # many times it occurs in that point's contents.
        distinct_words = set(tokenize(point['contents']))
        per_outcome = count_of_word_by_outcome.setdefault(classification, Counter())
        per_outcome.update(distinct_words)

    return {
        'count_of_word_by_outcome': count_of_word_by_outcome,
        'num_data_points': num_data_points,
        'count_of_data_points_with_outcome': count_of_data_points_with_outcome,
    }

"""
TODO - Implement the following functions.
In each of these functions you can use the MODEL variable, a dictionary with three entries:
    MODEL['count_of_word_by_outcome'][outcome][word] = number_of_documents_with_that_outcome_which_contain_word
    MODEL['num_data_points'] = number_of_documents_in_the_dataset
    MODEL['count_of_data_points_with_outcome'][outcome] = number_of_documents_in_the_dataset_whose_correct_outcome_is_outcome
"""
@cache
def pr_outcome(outcome: str):  # Pr(outcome)
    """Placeholder for the probability Pr(outcome).

    Not implemented yet: ``outcome`` is ignored and the constant 0 is
    returned. The function is memoized with ``@cache``, so ``outcome`` must
    be hashable.
    """
    placeholder = 0  # TODO: replace with the real computation
    return placeholder

@cache
def pr_word_given_outcome(word: str, outcome: str, num_words_in_document: int):  # Pr(word | outcome)
    """Placeholder for the conditional probability Pr(word | outcome).

    Not implemented yet: all three arguments are ignored and the constant 0
    is returned. The function is memoized with ``@cache``, so every argument
    must be hashable.
    """
    placeholder = 0  # TODO: replace with the real computation
    return placeholder

def pr_outcome_given_words(words: List[str], outcome: str):  # Pr(outcome | words)
    """Placeholder for the score Pr(outcome | words).

    Not implemented yet: both arguments are ignored and the constant 0 is
    returned.
    """
    placeholder = 0  # TODO: replace with the real computation
    return placeholder

def predict(outcomes, words):
    """Return the outcome that scores highest for ``words``.

    Each candidate in ``outcomes`` is scored with
    ``pr_outcome_given_words(words, candidate)``. A later candidate replaces
    the current winner only when its score is strictly greater, so ties keep
    the earliest candidate.

    Returns:
        The winning outcome, or ``None`` when ``outcomes`` is empty or no
        score is greater than ``-inf``.
    """
    winner, winner_score = None, -inf
    for candidate in outcomes:
        score = pr_outcome_given_words(words, candidate)
        # Written as "not >" (rather than "<=") so that a NaN score is
        # skipped exactly as a failed ">" comparison would skip it.
        if not score > winner_score:
            continue
        winner, winner_score = candidate, score
    return winner