Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
cs13-23sp
project02
Commits
fa240df0
Commit
fa240df0
authored
1 year ago
by
Yaksher
Browse files
Options
Download
Email Patches
Plain Diff
Revert "Rename outcomes to categories; unpack model_dict global"
This reverts commit
eb1b7980
.
parent
eb1b7980
master
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
bayes.py
+25
-24
bayes.py
main.py
+9
-13
main.py
with
34 additions
and
37 deletions
+34
-37
bayes.py
View file @
fa240df0
...
...
@@ -4,56 +4,57 @@ from math import log, inf
from
functools
import
cache
from
typing
import
List
"""After training, model_dict is a global variable which is accessible inside this module"""
@
cache
def
tokenize
(
text
):
return
[
y
for
y
in
[
re
.
sub
(
'[^a-z0-9]'
,
''
,
x
)
for
x
in
text
.
lower
().
split
(
" "
)]
if
len
(
y
)]
def
train
(
dataset
):
count_of_word_by_category
=
{}
model_dict
=
{}
count_of_word_by_outcome
=
{}
num_data_points
=
len
(
dataset
)
count_of_data_points_
in_category
=
Counter
()
count_of_data_points_
with_outcome
=
Counter
()
for
point
in
dataset
:
name
=
point
[
'name'
]
classification
=
point
[
'classification'
]
count_of_data_points_
in_category
[
classification
]
+=
1
if
classification
not
in
count_of_word_by_
category
:
count_of_word_by_
category
[
classification
]
=
Counter
()
count_of_data_points_
with_outcome
[
classification
]
+=
1
if
classification
not
in
count_of_word_by_
outcome
:
count_of_word_by_
outcome
[
classification
]
=
Counter
()
words
=
set
(
tokenize
(
point
[
'contents'
]))
for
word
in
words
:
count_of_word_by_category
[
classification
][
word
]
+=
1
count_of_word_by_outcome
[
classification
][
word
]
+=
1
model_dict
[
'count_of_word_by_outcome'
]
=
count_of_word_by_outcome
model_dict
[
'num_data_points'
]
=
num_data_points
model_dict
[
'count_of_data_points_with_outcome'
]
=
count_of_data_points_with_outcome
return
(
count_of_word_by_category
,
num_data_points
,
count_of_data_points_in_category
)
return
model_dict
"""
TODO - Implement the following functions.
After training (which will be run before your code), the following global
variables are available:
count_of_word_by_category[category][word] = Total number of documents in the
category 'category' in which this word appears
num_data_points = Total number of documents in the data set
count_of_data_points_in_category[category] = Total number of documents in
the category 'category'
In each of these functions, you can use the MODEL variable which is a dictionary which has three members:
MODEL['count_of_word_by_outcome'][outcome][word] = Total number of documents in the category 'outcome' in which this word appears
MODEL['num_data_points'] = Total number of documents in the data set
MODEL['count_of_data_points_with_outcome'][outcome] = Total number of documents in the category 'outcome'
"""
@
cache
def
pr_
category
(
category
:
str
)
:
# Pr(
category
)
def
pr_
outcome
(
outcome
:
str
)
:
# Pr(
outcome
)
return
0
@
cache
def
pr_word_given_
category
(
word
:
str
,
category
:
str
,
num_words_in_document
:
int
):
# Pr(word |
category
)
def
pr_word_given_
outcome
(
word
:
str
,
outcome
:
str
,
num_words_in_document
:
int
):
# Pr(word |
outcome
)
return
0
def
pr_
category
_given_words
(
words
:
List
[
str
],
category
:
str
):
# Pr(
category
| words)
def
pr_
outcome
_given_words
(
words
:
List
[
str
],
outcome
:
str
):
# Pr(
outcome
| words)
return
0
def
predict
(
categori
es
,
words
):
def
predict
(
outcom
es
,
words
):
best
=
None
best_likelihood
=
-
inf
for
category
in
categori
es
:
pr
=
pr_
category
_given_words
(
words
,
category
)
for
outcome
in
outcom
es
:
pr
=
pr_
outcome
_given_words
(
words
,
outcome
)
if
pr
>
best_likelihood
:
best
=
category
best
=
outcome
best_likelihood
=
pr
return
best
This diff is collapsed.
Click to expand it.
main.py
View file @
fa240df0
...
...
@@ -8,29 +8,25 @@ VALIDATE = 'data/validate.json'
train
=
json
.
loads
(
open
(
TRAIN
).
read
())
validate
=
json
.
loads
(
open
(
VALIDATE
).
read
())
def
test
(
dataset
,
categori
es
):
def
test
(
dataset
,
outcom
es
):
answers
=
dict
([
x
.
split
(
" "
)
for
x
in
open
(
dataset
+
"_validate.txt"
).
read
().
split
(
"
\n
"
)[:
-
1
]])
(
bayes
.
count_of_word_by_category
,
bayes
.
num_data_points
,
bayes
.
count_of_data_points_in_category
)
=
bayes
.
train
(
train
[
dataset
])
bayes
.
model_dict
=
bayes
.
train
(
train
[
dataset
])
correct_by_
category
=
Counter
()
incorrect_by_
category
=
Counter
()
correct_by_
outcome
=
Counter
()
incorrect_by_
outcome
=
Counter
()
for
point
in
validate
[
dataset
]:
words
=
set
(
bayes
.
tokenize
(
point
[
'contents'
]))
prediction
=
bayes
.
predict
(
categori
es
,
words
)
prediction
=
bayes
.
predict
(
outcom
es
,
words
)
answer
=
answers
[
point
[
'name'
]]
if
prediction
==
answer
:
correct_by_
category
[
answer
]
+=
1
correct_by_
outcome
[
answer
]
+=
1
else
:
incorrect_by_
category
[
answer
]
+=
1
incorrect_by_
outcome
[
answer
]
+=
1
print
(
correct_by_
category
)
print
(
incorrect_by_
category
)
print
(
correct_by_
outcome
)
print
(
incorrect_by_
outcome
)
test
(
'tweets'
,
set
([
'positive'
,
'negative'
]))
test
(
'emails'
,
set
([
'spam'
,
'ham'
]))
...
...
This diff is collapsed.
Click to expand it.
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment
Menu
Projects
Groups
Snippets
Help