cs13-23sp / project02 / Commits

Commit cb3610f7
Authored 1 year ago by Yaksher

Rename outcomes to categories; unpack model_dict global

Parent: fa240df0 (master)
Showing 2 changed files with 29 additions and 33 deletions:

  bayes.py  +20  -24
  main.py    +9   -9
bayes.py

@@ -11,50 +11,46 @@ def tokenize(text):
     return [y for y in [re.sub('[^a-z0-9]', '', x) for x in text.lower().split(" ")] if len(y)]

 def train(dataset):
-    model_dict = {}
-    count_of_word_by_outcome = {}
+    global count_of_word_by_category
+    global num_data_points
+    global num_data_points_in_category
+    count_of_word_by_category = {}
     num_data_points = len(dataset)
-    count_of_data_points_with_outcome = Counter()
+    num_data_points_in_category = Counter()
     for point in dataset:
         name = point['name']
         classification = point['classification']
-        count_of_data_points_with_outcome[classification] += 1
-        if classification not in count_of_word_by_outcome:
-            count_of_word_by_outcome[classification] = Counter()
+        num_data_points_in_category[classification] += 1
+        if classification not in count_of_word_by_category:
+            count_of_word_by_category[classification] = Counter()
         words = set(tokenize(point['contents']))
         for word in words:
-            count_of_word_by_outcome[classification][word] += 1
-    model_dict['count_of_word_by_outcome'] = count_of_word_by_outcome
-    model_dict['num_data_points'] = num_data_points
-    model_dict['count_of_data_points_with_outcome'] = count_of_data_points_with_outcome
-    return model_dict
+            count_of_word_by_category[classification][word] += 1

 """
 TODO - Implement the following functions.
-In each of these functions, you can use the MODEL variable which is a dictionary which has three members:
-MODEL['count_of_word_by_outcome'][outcome][word] = Total number of documents in the category 'outcome' in which this word appears
-MODEL['num_data_points'] = Total number of documents in the data set
-MODEL['count_of_data_points_with_outcome'][outcome] = Total number of documents in the category 'outcome'
+After training (which is run before your code), the following 3 global variables are available:
+count_of_word_by_category[category][word] = Total number of documents in the category 'category' in which this word appears
+num_data_points = Total number of documents in the data set
+num_data_points_in_category[category] = Total number of documents in the category 'category'
 """

 @cache
-def pr_outcome(outcome: str):
-    # Pr(outcome)
+def pr_category(category: str):
+    # Pr(category)
     return 0

 @cache
-def pr_word_given_outcome(word: str, outcome: str, num_words_in_document: int):
-    # Pr(word | outcome)
+def pr_word_given_category(word: str, category: str, num_words_in_document: int):
+    # Pr(word | category)
     return 0

-def pr_outcome_given_words(words: List[str], outcome: str):
-    # Pr(outcome | words)
+def pr_category_given_words(words: List[str], category: str):
+    # Pr(category | words)
     return 0

-def predict(outcomes, words):
+def predict(categories, words):
     best = None
     best_likelihood = -inf
-    for outcome in outcomes:
-        pr = pr_outcome_given_words(words, outcome)
+    for category in categories:
+        pr = pr_category_given_words(words, category)
         if pr > best_likelihood:
-            best = outcome
+            best = category
             best_likelihood = pr
     return best
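The probability functions above are deliberately left as stubs for the assignment. Purely as a sketch of how the three globals described in the docstring could be used (not the intended solution from this commit), the following fills them in with add-one smoothing and log-space scores. The smoothing constant, the log-space scoring, the example counts, and leaving num_words_in_document unused are all assumptions introduced here.

from collections import Counter
from functools import cache
from math import log
from typing import List

# Hypothetical stand-ins for the globals that train() populates.
count_of_word_by_category = {'spam': Counter({'free': 3, 'win': 2}),
                             'ham': Counter({'meeting': 4, 'free': 1})}
num_data_points = 10
num_data_points_in_category = Counter({'spam': 4, 'ham': 6})

@cache
def pr_category(category: str):
    # Pr(category): fraction of training documents in this category.
    return num_data_points_in_category[category] / num_data_points

@cache
def pr_word_given_category(word: str, category: str, num_words_in_document: int):
    # Pr(word | category) with add-one smoothing so unseen words do not
    # zero out the score; num_words_in_document is unused in this sketch.
    return ((count_of_word_by_category[category][word] + 1)
            / (num_data_points_in_category[category] + 2))

def pr_category_given_words(words: List[str], category: str):
    # log(Pr(category) * prod Pr(word | category)), i.e. Pr(category | words)
    # up to the normalizing constant shared by all categories.
    return log(pr_category(category)) + sum(
        log(pr_word_given_category(w, category, len(words))) for w in words)

Returning a log-score rather than a true probability still works with predict as written, since predict only compares scores and starts from -inf.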
main.py

@@ -8,25 +8,25 @@ VALIDATE = 'data/validate.json'

 train = json.loads(open(TRAIN).read())
 validate = json.loads(open(VALIDATE).read())

-def test(dataset, outcomes):
+def test(dataset, categories):
     answers = dict([x.split(" ") for x in open(dataset + "_validate.txt").read().split("\n")[:-1]])
-    bayes.model_dict = bayes.train(train[dataset])
-    correct_by_outcome = Counter()
-    incorrect_by_outcome = Counter()
+    bayes.train(train[dataset])
+    correct_by_category = Counter()
+    incorrect_by_category = Counter()
     for point in validate[dataset]:
         words = set(bayes.tokenize(point['contents']))
-        prediction = bayes.predict(outcomes, words)
+        prediction = bayes.predict(categories, words)
         answer = answers[point['name']]
         if prediction == answer:
-            correct_by_outcome[answer] += 1
+            correct_by_category[answer] += 1
         else:
-            incorrect_by_outcome[answer] += 1
-    print(correct_by_outcome)
-    print(incorrect_by_outcome)
+            incorrect_by_category[answer] += 1
+    print(correct_by_category)
+    print(incorrect_by_category)

 test('tweets', set(['positive', 'negative']))
 test('emails', set(['spam', 'ham']))
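With model_dict unpacked into module-level globals, a caller no longer keeps a returned model around: after bayes.train(...), the counts are read straight off the bayes module, which is what the updated test() relies on. A minimal illustration, with a made-up one-document dataset matching the keys train() reads:

import bayes

# Hypothetical single-document training set.
dataset = [{'name': 'doc1', 'classification': 'spam', 'contents': 'win free money'}]
bayes.train(dataset)

print(bayes.num_data_points)                            # 1
print(bayes.num_data_points_in_category['spam'])        # 1
print(bayes.count_of_word_by_category['spam']['free'])  # 1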