[code] 2๋‹จ์› ์‹ค์Šต์ฝ”๋“œ

์ˆ˜์ • ๋ชฉ๋ก
1. (9์ชฝ) ์ž๋ฃŒ ๋ฌธ์ž์—ด

# Four sample sentences used throughout the chapter as the raw text corpus.
a1, a2, a3, a4 = (
    'I like math. I like science as well, especially physics.',
    'I prefer math and geometry to social science. And I don’t like sports.',
    'I’m afraid but I don’t like math nor science.',
    'I like cooking and my favorite food is Korean food.',
)

2. (9์ชฝ) ์ „์ฒ˜๋ฆฌ ์ž‘์—… ์ฝ”๋“œ

# Preprocess the four raw sentences for text mining:
# strip curly apostrophes, commas and periods, then lowercase everything.
# (Restored the loop indentation lost in the book's extraction.)
data_list = [a1, a2, a3, a4]
mining_list = []
for data in data_list:
    data = data.replace("’", "")   # curly apostrophe used by the section-1 sentences
    data = data.replace(",", "")
    data = data.replace(".", "")
    data = data.lower()
    mining_list.append(data)

# Show the cleaned sentences.
for new_data in mining_list:
    print(new_data)

3. (10์ชฝ) ๋ฒกํ„ฐํ™” ์ฝ”๋“œ

from sklearn.feature_extraction.text import CountVectorizer

# Fit a bag-of-words vocabulary on the preprocessed sentences.
# NOTE(review): `corpus` is expected to be the cleaned sentence list
# (mining_list); it is built explicitly in the section-10 listing — confirm
# it is defined before this snippet runs.
vect = CountVectorizer()
vect.fit(corpus)

vect_dict = vect.vocabulary_  # mapping: word -> column index in the vector

word_list = []
index_list = []

# Split the vocabulary, in alphabetical order, into parallel word/index lists.
for word, index in sorted(vect_dict.items()):
    word_list.append(word)
    index_list.append(index)
print(sorted(vect_dict.items()), end=' ')



4. (11์ชฝ) ๊ฐ ๋ฌธ์žฅ ๋ฒกํ„ฐ๋กœ ๋ณ€ํ™˜ํ•˜๋Š” ๊ณผ์ • ์ฝ”๋“œ

# Transform each sentence — plus the union document U (all four sentences
# concatenated) — into a count vector, then print the dictionary and vectors.
# (Restored the loop indentation lost in the book's extraction.)
vectors = []

u = a1 + ' ' + a2 + ' ' + a3 + ' ' + a4  # union document U

for i in corpus:
    a = vect.transform([i]).toarray()
    vectors.append(a)
u_ = vect.transform([u]).toarray()
vectors.append(u_)

namelist = ['A1', 'A2', 'A3', 'A4', 'U']
print('---------- 단어 사전 ----------')
print(sorted(vect_dict.items()), end=' ')
print("""

""")
print('---------- 벡터 ----------')
for name, vector in zip(namelist, vectors):
    print(name, ':', vector)



5. (12์ชฝ) ํ‘œ๋กœ ํ‘œํ˜„ํ•˜๋Š” ๊ณผ์ • ์ฝ”๋“œ

import pandas as pd

# Tabulate the count vectors: one row per vocabulary word, one column per
# document (A1..A4 plus the union document U).
con_dict = {'word': word_list}
for column, vector in zip(['A1', 'A2', 'A3', 'A4', 'U'], vectors):
    con_dict[column] = vector[0]

a = pd.DataFrame(con_dict)

a


6. (13์ชฝ) ๋ฌธ์„œ ๋นˆ๋„์ˆ˜ ์ถœ๋ ฅ ์ฝ”๋“œ

# Document frequency (DF) per vocabulary word.
# Each counter starts at len(vectors) - 1 and is decremented once for every
# vector in which the word does not occur. Since the last vector is the union
# document U (which presumably contains every vocabulary word — verify against
# the section-4 listing), the result is the word's DF over the base documents.
# (Replaced the append loop with list multiplication; restored indentation.)
df_list = [len(vectors) - 1] * len(word_list)

for i in range(len(vectors[0][0])):
    for vector in vectors:
        if vector[0][i] == 0:
            df_list[i] -= 1

print(df_list)


7. (13์ชฝ) ์ƒ๋Œ€๋„์ˆ˜ ์ถœ๋ ฅ ์ฝ”๋“œ


# Relative document frequency: DF divided by the number of base documents n
# (the union document U is excluded from the count).
n = len(vectors) - 1

relative_freq_list = [df_count / n for df_count in df_list]

relative_freq_dict = dict(word=word_list, Relative_Freq=relative_freq_list)

temp = pd.DataFrame(relative_freq_dict)
temp

8. (14์ชฝ) ์—ญ๋ฌธ์„œ ๋นˆ๋„์ˆ˜ ์ถœ๋ ฅ ์ฝ”๋“œ

# Inverse document frequency: the reciprocal of each relative frequency.
idf_list = [1 / rel_freq for rel_freq in relative_freq_list]

# Build the comparison table column by column.
idf_dict = {'word': word_list}
idf_dict['Relative_Freq'] = relative_freq_list
idf_dict['Inverse DF'] = idf_list

temp = pd.DataFrame(idf_dict)
temp


9. (15์ชฝ) TFxIDF ์ถœ๋ ฅ ์ฝ”๋“œ

# TF x IDF for document A1: term frequency (its count vector) times the
# inverse document frequency of each word.
tfidf_list = [frequency * inverse for frequency, inverse in zip(vectors[0][0], idf_list)]

# Assemble the table column by column.
tfidf_dict = {'word': word_list}
tfidf_dict['TF'] = vectors[0][0]
tfidf_dict['IDF'] = idf_list
tfidf_dict['TFxIDF'] = tfidf_list

temp = pd.DataFrame(tfidf_dict)

temp


10. (23์ชฝ) ์œ ์‚ฌ๋„ ๋ถ„์„ ์ฝ”๋“œ

import pandas as pd

a1 = 'I like math. I like science as well, especially physics.'
a2 = "I prefer math and geometry to social science. And I don't like sports."
a3 = "I’m afraid but I don’t like math nor science."
a4 = 'I like cooking and my favorite food is Korean food.'

# Preprocess: strip apostrophes and punctuation, lowercase.
# BUG FIX: a2 uses the straight apostrophe (') but a3 uses the curly one (’);
# the original removed only the straight form, leaving "i’m"/"don’t" in a3 and
# producing a vocabulary inconsistent with the section-2 listing. Remove both.
data_list = [a1, a2, a3, a4]
mining_list = []
for data in data_list:
    data = data.replace("'", "")
    data = data.replace("’", "")
    data = data.replace(",", "")
    data = data.replace(".", "")
    data = data.lower()
    mining_list.append(data)

# Rebind a1..a4 to their cleaned versions AND collect them as the corpus.
corpus = [a1, a2, a3, a4] = [new_data for new_data in mining_list]

from sklearn.feature_extraction.text import CountVectorizer

# Fit the bag-of-words vocabulary on the cleaned corpus.
vect = CountVectorizer()
vect.fit(corpus)

vect_dict = vect.vocabulary_  # word -> column index

word_list = []
index_list = []

vectors = []

u = a1 + ' ' + a2 + ' ' + a3 + ' ' + a4  # union document U (unused below)

# One count vector per cleaned sentence.
for i in corpus:
    a = vect.transform([i]).toarray()
    vectors.append(a)

vectors



11. (24์ชฝ) ๋ฆฌ์ŠคํŠธ ๊ฐ„์†Œํ™”

# Flatten each 1xV numpy count array into a plain Python list, then unpack
# the four sentence vectors into v1..v4 for the similarity functions.
# (Replaced the append loop with a comprehension; restored indentation.)
vectors_list = [list(vector_array[0]) for vector_array in vectors]

[v1, v2, v3, v4] = vectors_list

print(v1, v2, v3, v4)


12. (24์ชฝ) ์œ ํด๋ฆฌ๋“œ ์œ ์‚ฌ๋„ ํ•จ์ˆ˜๋กœ ๋งŒ๋“œ๋Š” ์ฝ”๋“œ

import math

# Euclidean Similarity e(p,q)
def e(p, q):
    """Return the Euclidean distance between vectors p and q, rounded to 2 decimals.

    p and q are equal-length numeric sequences. (Restored the function body
    indentation lost in the book's extraction.)
    """
    val = 0
    for k in range(len(p)):
        diff = p[k] - q[k]
        val += diff * diff
    return round(math.sqrt(val), 2)


13. (25์ชฝ) ์ƒํ˜ธ์œ ์‚ฌ๋„ํ–‰๋ ฌ ์ถœ๋ ฅ, ์‹œ๊ฐํ™” ์ฝ”๋“œ

import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise Euclidean-distance matrix over the sentence vectors.
# (Restored the nested-loop indentation lost in the book's extraction.)
euclidean_m = []
temp = []

for a in vectors_list:
    for b in vectors_list:
        temp.append(e(a, b))
    euclidean_m.append(temp)
    temp = []

print('Euclidean Similarity Table :')
for k in euclidean_m:
    print(k)

labels = ['v1', 'v2', 'v3', 'v4']

# Visualize the matrix as an annotated heatmap (darker red = larger value).
df = sns.heatmap(euclidean_m, cmap='Reds', annot=True,
                 xticklabels=labels, yticklabels=labels)

plt.show()


14. (27์ชฝ) ์ฝ”์‚ฌ์ธ ์œ ์‚ฌ๋„ ๋ถ„์„ ์ฝ”๋“œ


# Cosine Similarity c(p,q)
def c(p, q):
    """Return the cosine similarity of vectors p and q, rounded to 2 decimals.

    p and q are equal-length numeric sequences. Raises ZeroDivisionError if
    either vector is all zeros. (Restored the function body indentation lost
    in the book's extraction.)
    """
    val = 0
    upp = 0       # dot product p . q
    down_1 = 0    # squared norm of p
    down_2 = 0    # squared norm of q
    for k in range(len(p)):
        upp += p[k] * q[k]
        down_1 += p[k] * p[k]
        down_2 += q[k] * q[k]
    down_1 = math.sqrt(down_1)
    down_2 = math.sqrt(down_2)
    val += upp
    val /= down_1 * down_2
    return round(val, 2)


15. (27์ชฝ) ์ฝ”์‚ฌ์ธ ์œ ์‚ฌ๋„ ๋ถ„์„ ์ƒํ˜ธ ์œ ์‚ฌ๋„ ํ–‰๋ ฌ ๋ฐ ์‹œ๊ฐํ™” ์ฝ”๋“œ


# Pairwise cosine-similarity matrix over the sentence vectors.
# (Restored the nested-loop indentation lost in the book's extraction.)
cosine_m = []
temp = []

for a in vectors_list:
    for b in vectors_list:
        temp.append(c(a, b))
    cosine_m.append(temp)
    temp = []

print('Cosine Similarity Table :')
for k in cosine_m:
    print(k)

labels = ['v1', 'v2', 'v3', 'v4']

# Visualize the matrix as an annotated heatmap (darker red = more similar).
df = sns.heatmap(cosine_m, cmap='Reds', annot=True,
                 xticklabels=labels, yticklabels=labels)

plt.show()