# bayes.py
import json, re
from collections import Counter
from math import log, inf
from functools import cache
from typing import List

@cache
def tokenize(text):
    # Lowercase, split on spaces, strip every non-alphanumeric character from
    # each token, and drop tokens that end up empty.
    return [y for y in [re.sub('[^a-z0-9]', '', x) for x in text.lower().split(" ")] if len(y)]
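# For example, tokenize("Hello, World 42!") returns ['hello', 'world', '42'].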

def train(dataset):
    # For each category, count the number of documents in which each word
    # appears (document frequency), along with the per-category document
    # counts and the total number of documents.
    count_of_word_by_category = {}
    num_data_points = len(dataset)
    count_of_data_points_in_category = Counter()
    for point in dataset:
        name = point['name']
        classification = point['classification']
        count_of_data_points_in_category[classification] += 1
        if classification not in count_of_word_by_category:
            count_of_word_by_category[classification] = Counter()
        # Use a set so a word is counted at most once per document.
        words = set(tokenize(point['contents']))
        for word in words:
            count_of_word_by_category[classification][word] += 1

    return (count_of_word_by_category, num_data_points, count_of_data_points_in_category)
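
# A minimal sketch of how train() might be exercised on a hypothetical toy
# dataset (the dicts below are illustrative, not part of the assignment):
def _demo_train():
    toy = [
        {'name': 'a', 'classification': 'spam', 'contents': 'Win money now!'},
        {'name': 'b', 'classification': 'ham', 'contents': 'Lunch at noon?'},
    ]
    word_counts, n, category_counts = train(toy)
    # word_counts['spam']['money'] == 1; n == 2; category_counts['ham'] == 1
    return word_counts, n, category_counts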

"""
TODO - Implement the following functions.

After training (which will be run before your code), the following global 
variables are available:
    count_of_word_by_category[category][word] = Total number of documents in the 
        category 'category' in which this word appears

    num_data_points = Total number of documents in the data set

    count_of_data_points_in_category[category] = Total number of documents in 
        the category 'category'
"""
@cache
def pr_category(category: str):  # Pr(category)
    return 0

@cache
def pr_word_given_category(word: str, category: str, num_words_in_document: int):  # Pr(word | category)
    return 0

def pr_category_given_words(words: List[str], category: str):  # Pr(category | words)
    return 0

def predict(categories, words):
    # Return the category whose score from pr_category_given_words is highest.
    best = None
    best_likelihood = -inf
    for category in categories:
        pr = pr_category_given_words(words, category)
        if pr > best_likelihood:
            best = category
            best_likelihood = pr
    return best
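
# The sketch below is one possible way to fill in the TODOs above: a
# Bernoulli-style naive Bayes with Laplace (add-one) smoothing, scored in log
# space. The `_example_*` names and the smoothing choice are illustrative
# assumptions, not part of the assignment; the functions read the globals
# described in the docstring (count_of_word_by_category, num_data_points,
# count_of_data_points_in_category), and this variant has no use for the
# num_words_in_document parameter, so it is omitted here.

def _example_pr_category(category):
    # Pr(category) = fraction of training documents labelled `category`.
    return count_of_data_points_in_category[category] / num_data_points

def _example_pr_word_given_category(word, category):
    # Pr(word | category) = fraction of documents in `category` that contain
    # `word`, with add-one smoothing so unseen words never get probability 0.
    docs_in_category = count_of_data_points_in_category[category]
    docs_with_word = count_of_word_by_category[category][word]
    return (docs_with_word + 1) / (docs_in_category + 2)

def _example_pr_category_given_words(words, category):
    # log Pr(category | words) up to an additive constant:
    #     log Pr(category) + sum over w of log Pr(w | category)
    # The shared denominator Pr(words) is dropped because it does not change
    # which category wins the comparison in predict().
    score = log(_example_pr_category(category))
    for word in set(words):
        score += log(_example_pr_word_given_category(word, category))
    return score

# Hypothetical usage, assuming train()'s results have been bound to the
# globals named in the docstring:
#     predict(['spam', 'ham'], tokenize("win money at lunch"))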