Commit fa240df0 authored by Yaksher
Browse files

Revert "Rename outcomes to categories; unpack model_dict global"

This reverts commit eb1b7980.
parent eb1b7980
No related merge requests found
Showing with 34 additions and 37 deletions
+34 -37
......@@ -4,56 +4,57 @@ from math import log, inf
from functools import cache
from typing import List
"""After training, model_dict is a global variable which is accessible inside this module"""
# Matches every character that is not a lowercase ASCII letter or a digit.
_NON_ALNUM = re.compile(r'[^a-z0-9]')


@cache
def tokenize(text):
    """Split *text* into lowercase alphanumeric tokens.

    The text is lowercased and split on any run of whitespace (spaces, tabs,
    newlines), so words on adjacent lines are not fused into one token.
    Every character outside ``a-z0-9`` is then removed from each piece, and
    pieces that end up empty are dropped.

    Args:
        text: The string to tokenize. It must be hashable because results
            are memoized with ``functools.cache``.

    Returns:
        A list of non-empty tokens in their original order. The list object
        is cached and shared between calls with the same *text*, so callers
        must not mutate it.
    """
    stripped = (_NON_ALNUM.sub('', piece) for piece in text.lower().split())
    return [token for token in stripped if token]
# Build per-classification counts from `dataset`: how many data points carry
# each classification, and in how many data points of each classification a
# given word appears.
# NOTE(review): this span looks like a rendered diff with both sides
# interleaved and indentation stripped -- every `*_category` statement has a
# matching `*_outcome` statement and there are two consecutive `return`s.
# Confirm against the repository which side is the live code.
def train(dataset):
count_of_word_by_category = {}
model_dict = {}
count_of_word_by_outcome = {}
num_data_points = len(dataset)
count_of_data_points_in_category = Counter()
count_of_data_points_with_outcome = Counter()
for point in dataset:
# `name` is read here but not used again inside this function.
name = point['name']
classification = point['classification']
count_of_data_points_in_category[classification] += 1
if classification not in count_of_word_by_category:
count_of_word_by_category[classification] = Counter()
count_of_data_points_with_outcome[classification] += 1
if classification not in count_of_word_by_outcome:
count_of_word_by_outcome[classification] = Counter()
# set() de-duplicates the tokens, so each word is counted at most once per
# data point (document frequency, not term frequency).
words = set(tokenize(point['contents']))
for word in words:
count_of_word_by_category[classification][word] += 1
count_of_word_by_outcome[classification][word] += 1
# The dict variant packs the three statistics under string keys.
model_dict['count_of_word_by_outcome'] = count_of_word_by_outcome
model_dict['num_data_points'] = num_data_points
model_dict['count_of_data_points_with_outcome'] = count_of_data_points_with_outcome
# Two return forms are shown: a 3-tuple and the dict. If both were live code
# the second could never execute.
return (count_of_word_by_category, num_data_points, count_of_data_points_in_category)
return model_dict
"""
TODO - Implement the following functions.
After training (which will be run before your code), the following global
variables are available:
count_of_word_by_category[category][word] = Total number of documents in the
category 'category' in which this word appears
num_data_points = Total number of documents in the data set
count_of_data_points_in_category[category] = Total number of documents in
the category 'category'
In each of these functions, you can use the MODEL variable which is a dictionary which has three members:
MODEL['count_of_word_by_outcome'][outcome][word] = Total number of documents in the category 'outcome' in which this word appears
MODEL['num_data_points'] = Total number of documents in the data set
MODEL['count_of_data_points_with_outcome'][outcome] = Total number of documents in the category 'outcome'
"""
# Placeholder for the prior probability of a label; it currently returns 0
# for every input (the TODO text above asks for an implementation).
# NOTE(review): two `def` headers share one body here -- this looks like the
# two sides of a rename in a rendered diff; confirm which name is current.
@cache
def pr_category(category : str) : # Pr(category)
def pr_outcome(outcome : str) : # Pr(outcome)
return 0
# Placeholder for the probability of a word given a label; it currently
# returns 0 for every input and does not read any of its parameters.
# NOTE(review): two `def` headers share one body here -- this looks like the
# two sides of a rename in a rendered diff; confirm which name is current.
@cache
def pr_word_given_category(word : str, category : str, num_words_in_document : int): # Pr(word | category)
def pr_word_given_outcome(word : str, outcome : str, num_words_in_document : int): # Pr(word | outcome)
return 0
# Placeholder for the score of a label given the words of a document; it
# currently returns 0 for every input. Unlike the two stubs above it is not
# decorated with @cache.
# NOTE(review): two `def` headers share one body here -- this looks like the
# two sides of a rename in a rendered diff; confirm which name is current.
def pr_category_given_words(words : List[str], category : str): # Pr(category | words)
def pr_outcome_given_words(words : List[str], outcome : str): # Pr(outcome | words)
return 0
# Return the candidate label with the highest score for `words`.
# NOTE(review): the `category` and `outcome` lines duplicate each other --
# this looks like both sides of a rendered diff; confirm which is current.
def predict(categories, words):
def predict(outcomes, words):
# `best` stays None if there are no candidates or no score exceeds -inf.
best = None
best_likelihood = -inf
for category in categories:
pr = pr_category_given_words(words, category)
for outcome in outcomes:
pr = pr_outcome_given_words(words, outcome)
# Strict comparison: on equal scores the candidate seen first is kept.
if pr > best_likelihood:
best = category
best = outcome
best_likelihood = pr
return best
......@@ -8,29 +8,25 @@ VALIDATE = 'data/validate.json'
# Load the training and validation corpora. The context managers close each
# file as soon as it has been parsed instead of leaving the handle open.
with open(TRAIN) as train_file:
    train = json.load(train_file)
with open(VALIDATE) as validate_file:
    validate = json.load(validate_file)
# Train on train[dataset], predict a label for every point in
# validate[dataset], and print how many predictions were right and wrong,
# tallied by the expected answer.
# NOTE(review): the `*_category` and `*_outcome` lines duplicate each other --
# this looks like both sides of a rendered diff; confirm which is current.
def test(dataset, categories):
def test(dataset, outcomes):
# Expected labels come from "<dataset>_validate.txt": each line is split on a
# single space into a (name, label) pair. [:-1] discards the last piece of the
# newline split -- presumably the empty string after a trailing newline.
# NOTE(review): the file handle opened here is never explicitly closed.
answers = dict([x.split(" ") for x in open(dataset + "_validate.txt").read().split("\n")[:-1]])
# The training result is stored on the `bayes` module, either unpacked into
# three attributes or as a single `model_dict` attribute.
(
bayes.count_of_word_by_category,
bayes.num_data_points,
bayes.count_of_data_points_in_category
) = bayes.train(train[dataset])
bayes.model_dict = bayes.train(train[dataset])
correct_by_category = Counter()
incorrect_by_category = Counter()
correct_by_outcome = Counter()
incorrect_by_outcome = Counter()
for point in validate[dataset]:
# Tokens are de-duplicated before prediction.
words = set(bayes.tokenize(point['contents']))
prediction = bayes.predict(categories, words)
prediction = bayes.predict(outcomes, words)
# Expected labels are looked up by the point's 'name'.
answer = answers[point['name']]
if prediction == answer:
correct_by_category[answer] += 1
correct_by_outcome[answer] += 1
else:
incorrect_by_category[answer] += 1
incorrect_by_outcome[answer] += 1
print(correct_by_category)
print(incorrect_by_category)
print(correct_by_outcome)
print(incorrect_by_outcome)
# Evaluate each corpus against its pair of possible labels.
test('tweets', {'positive', 'negative'})
test('emails', {'spam', 'ham'})
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment