From fa240df01991c14e4a16de9550699ae2b4e6f7a8 Mon Sep 17 00:00:00 2001 From: Yaksher <yaksher.git@gmail.com> Date: Sat, 11 Nov 2023 22:46:25 -0800 Subject: [PATCH] Revert "Rename outcomes to categories; unpack model_dict global" This reverts commit eb1b7980030e898d910ce48b14ec88698ffa313e. --- bayes.py | 49 +++++++++++++++++++++++++------------------------ main.py | 22 +++++++++------------- 2 files changed, 34 insertions(+), 37 deletions(-) diff --git a/bayes.py b/bayes.py index 54bb9878..6752762a 100644 --- a/bayes.py +++ b/bayes.py @@ -4,56 +4,57 @@ from math import log, inf from functools import cache from typing import List +"""After training, model_dict is a global variable which is accessible inside this module""" + @cache def tokenize(text): return [y for y in [re.sub('[^a-z0-9]', '', x) for x in text.lower().split(" ")] if len(y)] def train(dataset): - count_of_word_by_category = {} + model_dict = {} + count_of_word_by_outcome = {} num_data_points = len(dataset) - count_of_data_points_in_category = Counter() + count_of_data_points_with_outcome = Counter() for point in dataset: name = point['name'] classification = point['classification'] - count_of_data_points_in_category[classification] += 1 - if classification not in count_of_word_by_category: - count_of_word_by_category[classification] = Counter() + count_of_data_points_with_outcome[classification] += 1 + if classification not in count_of_word_by_outcome: + count_of_word_by_outcome[classification] = Counter() words = set(tokenize(point['contents'])) for word in words: - count_of_word_by_category[classification][word] += 1 + count_of_word_by_outcome[classification][word] += 1 + + model_dict['count_of_word_by_outcome'] = count_of_word_by_outcome + model_dict['num_data_points'] = num_data_points + model_dict['count_of_data_points_with_outcome'] = count_of_data_points_with_outcome - return (count_of_word_by_category, num_data_points, count_of_data_points_in_category) + return model_dict """ TODO - Implement the following functions. - -After training (which will be run before your code), the following global -variables are available: - count_of_word_by_category[category][word] = Total number of documents in the - category 'category' in which this word appears - - num_data_points = Total number of documents in the data set - - count_of_data_points_in_category[category] = Total number of documents in - the category 'category' +In each of these functions, you can use the MODEL variable which is a dictionary which has three members: + MODEL['count_of_word_by_outcome'][outcome][word] = Total number of documents in the category 'outcome' in which this word appears + MODEL['num_data_points'] = Total number of documents in the data set + MODEL['count_of_data_points_with_outcome'][outcome] = Total number of documents in the category 'outcome' """ @cache -def pr_category(category : str) : # Pr(category) +def pr_outcome(outcome : str) : # Pr(outcome) return 0 @cache -def pr_word_given_category(word : str, category : str, num_words_in_document : int): # Pr(word | category) +def pr_word_given_outcome(word : str, outcome : str, num_words_in_document : int): # Pr(word | outcome) return 0 -def pr_category_given_words(words : List[str], category : str): # Pr(category | words) +def pr_outcome_given_words(words : List[str], outcome : str): # Pr(outcome | words) return 0 -def predict(categories, words): +def predict(outcomes, words): best = None best_likelihood = -inf - for category in categories: - pr = pr_category_given_words(words, category) + for outcome in outcomes: + pr = pr_outcome_given_words(words, outcome) if pr > best_likelihood: - best = category + best = outcome best_likelihood = pr return best diff --git a/main.py b/main.py index b15ff249..191ba508 100644 --- a/main.py +++ b/main.py @@ -8,29 +8,25 @@ VALIDATE = 'data/validate.json' train = json.loads(open(TRAIN).read()) validate = json.loads(open(VALIDATE).read()) -def test(dataset, categories): +def test(dataset, outcomes): answers = dict([x.split(" ") for x in open(dataset + "_validate.txt").read().split("\n")[:-1]]) - ( - bayes.count_of_word_by_category, - bayes.num_data_points, - bayes.count_of_data_points_in_category - ) = bayes.train(train[dataset]) + bayes.model_dict = bayes.train(train[dataset]) - correct_by_category = Counter() - incorrect_by_category = Counter() + correct_by_outcome = Counter() + incorrect_by_outcome = Counter() for point in validate[dataset]: words = set(bayes.tokenize(point['contents'])) - prediction = bayes.predict(categories, words) + prediction = bayes.predict(outcomes, words) answer = answers[point['name']] if prediction == answer: - correct_by_category[answer] += 1 + correct_by_outcome[answer] += 1 else: - incorrect_by_category[answer] += 1 + incorrect_by_outcome[answer] += 1 - print(correct_by_category) - print(incorrect_by_category) + print(correct_by_outcome) + print(incorrect_by_outcome) test('tweets', set(['positive', 'negative'])) test('emails', set(['spam', 'ham'])) -- GitLab