import datetime
import math
import re
import time
from collections import Counter
# Sample sentences used by the script entry point.
text1 = "Baseball wins a score in the season."
text2 = "In the season,soccer loss a score."

# Matches every character that is not an ASCII letter; compiled once so the
# tokenizer does not look the pattern up again for each token.
_NON_LETTERS = re.compile('[^a-zA-Z]')


def _word_counts(text):
    """Return a Counter mapping each normalized word in *text* to its frequency."""
    counts = Counter()
    for token in text.split(' '):
        # Drop everything but ASCII letters, then fold case.
        word = _NON_LETTERS.sub('', token).lower()
        if word:  # tokens with no letters at all (e.g. "123", "!!") are skipped
            counts[word] += 1
    return counts


def compute_cosine(text_a: str, text_b: str, *, verbose: bool = True) -> float:
    """Return the cosine similarity of two texts' word-frequency vectors.

    Each text is split on single space characters; every token then has all
    non-ASCII-letter characters removed and is lower-cased. Because stripping
    happens after splitting, a token such as ``"season,soccer"`` becomes the
    single word ``"seasonsoccer"``.

    Args:
        text_a: First text.
        text_b: Second text.
        verbose: When true (the default), print the word counts, the
            frequency-sorted (word, count) lists and both frequency vectors
            to stdout. Pass ``False`` to compute silently.

    Returns:
        The cosine similarity rounded to two decimal places, or ``0.0`` when
        either text contains no words.
    """
    # Find the words and their frequencies.
    counts_a = _word_counts(text_a)
    counts_b = _word_counts(text_b)
    if verbose:
        print(dict(counts_a))
        print(dict(counts_b))

    # Sort by descending frequency; ties keep first-occurrence order.
    ranked_a = counts_a.most_common()
    ranked_b = counts_b.most_common()
    if verbose:
        print(ranked_a)
        print(ranked_b)

    # Build the shared vocabulary: words of text_a first, then the words that
    # appear only in text_b.
    vocabulary = [word for word, _ in ranked_a]
    vocabulary.extend(word for word, _ in ranked_b if word not in counts_a)

    # A Counter yields 0 for a missing word, so no membership test is needed.
    vect_a = [counts_a[word] for word in vocabulary]
    vect_b = [counts_b[word] for word in vocabulary]
    if verbose:
        print(vect_a)
        print(vect_b)

    # Compute the cosine similarity.
    dot_product = sum(a * b for a, b in zip(vect_a, vect_b))
    squares_a = sum(a * a for a in vect_a)
    squares_b = sum(b * b for b in vect_b)
    norm_product = math.sqrt(squares_a) * math.sqrt(squares_b)
    if norm_product == 0:
        # At least one text has no words, so the similarity is defined as 0.
        return 0.0
    return round(dot_product / norm_product, 2)
if __name__ == '__main__':
    # Script entry point: print the similarity of the two sample sentences.
    similarity = compute_cosine(text1, text2)
    print(similarity)