计算向量相似度和皮尔森r的相关 (Compute the correlation between vector similarity and Pearson's r)

This commit is contained in:
mxr612 2024-05-14 22:46:44 +08:00
parent e4b0d3e583
commit 2a2e5f9d27
2 changed files with 70 additions and 26 deletions

View File

@ -2,30 +2,16 @@ import json
import os
import random
import pandas
import numpy
# import matplotlib
from sklearn.metrics.pairwise import cosine_similarity
from openai import OpenAI

# SECURITY(review): a live-looking API key is hardcoded here and is now in
# git history -- rotate it and move it to the environment / a secrets store.
# setdefault preserves the old fallback behaviour while letting a real
# environment variable take precedence.
os.environ.setdefault("OPENAI_API_KEY", "sk-PRJ811XeKzEy20Ug3dA98a34Af8b40B5816dE15503D33599")
os.environ.setdefault("OPENAI_BASE_URL", "http://154.9.28.247:3000/v1/")
# Client must be created AFTER the env vars are set (this ordering was the
# point of the original change).
client = OpenAI()
def calc_similarity(scale):
item=[]
vec=[]
for i in scale:
item.append(i)
vec.append(client.embeddings.create(
input=scale[i], model="text-embedding-3-small" # nomic-embed-text text-embedding-3-small
).data[0].embedding)
simi=cosine_similarity(vec)
que=[]
for i,v in enumerate(simi):
for j in range(0,i):
que.append({"from":item[j], "to":item[i], "similarity":simi[i][j]})
return sorted(que, key = lambda t : t["similarity"], reverse=True)
def batch(): def batch():
scales = os.listdir("Scales") scales = os.listdir("Scales")
@ -48,17 +34,35 @@ def old_type(str):
with open(str,"w") as file: with open(str,"w") as file:
file.write(json.dumps(new)) file.write(json.dumps(new))
def calc_similarity(scale):
    """Embed each item text in *scale* and return all pairwise cosine similarities.

    Parameters
    ----------
    scale : dict
        Maps an item name to the text to be embedded.

    Returns
    -------
    list[dict]
        One entry per unordered pair (lower triangle, no self-pairs):
        ``{"from": name_a, "to": name_b, "similarity": float}``, unsorted.
    """
    names = list(scale)
    # One embedding request per item.  Model kept as-is; "nomic-embed-text"
    # was an alternative tried previously.
    vectors = [
        client.embeddings.create(
            input=scale[name], model="text-embedding-3-small"  # nomic-embed-text text-embedding-3-small
        ).data[0].embedding
        for name in names
    ]
    simi = cosine_similarity(vectors)
    # Walk the strict lower triangle so each unordered pair appears once.
    return [
        {"from": names[col], "to": names[row], "similarity": simi[row][col]}
        for row in range(len(names))
        for col in range(row)
    ]
def similarity(force: bool = False, sort: bool = True):
    """Return the item-pair similarity list, using "Temp/items.json" as a cache.

    Parameters
    ----------
    force : bool
        Recompute the embeddings even when a non-empty cache file exists.
    sort : bool
        Sort the pairs by descending similarity before returning.

    Returns
    -------
    list[dict]
        Entries of the form ``{"from": ..., "to": ..., "similarity": float}``.
    """
    cache = "Temp/items.json"
    # Fix: os.path.getsize raises FileNotFoundError when the cache file does
    # not exist yet (fresh checkout) -- treat "missing" the same as "empty".
    if force or not os.path.exists(cache) or os.path.getsize(cache) == 0:
        que = calc_similarity(batch())
        with open(cache, "w") as items:
            items.write(json.dumps(que))
    else:
        with open(cache, "r") as items:
            que = json.load(items)
    if sort:
        return sorted(que, key=lambda t: t["similarity"], reverse=True)
    return que
def data(): def make_data():
s="" s=""
item = batch() item = batch()
for i in item: for i in item:
@ -71,3 +75,22 @@ def data():
s+='\n' s+='\n'
with open("Temp/data.csv","w") as data: with open("Temp/data.csv","w") as data:
data.write(s) data.write(s)
def corelation(sort: bool = True):
    """Pearson correlations between every ordered pair of distinct data columns.

    Reads the per-item response matrix that ``make_data()`` writes and computes
    Pearson's r for each ordered pair of different columns.

    Parameters
    ----------
    sort : bool
        Sort pairs by descending ``abs(r)`` before returning.

    Returns
    -------
    list[dict]
        Entries of the form ``{"from": ..., "to": ..., "psr": float}``.
    """
    # Fix: make_data() writes "Temp/data.csv"; reading "data.csv" raised
    # FileNotFoundError.  Also avoid naming the frame "data" (clashes with
    # the sibling file handle name used elsewhere in this file).
    frame = pandas.read_csv("Temp/data.csv")
    que = []
    for col_i in frame:
        for col_j in frame:
            if col_i == col_j:
                continue
            try:
                que.append({"from": col_j, "to": col_i, "psr": frame[col_i].corr(frame[col_j])})
            except (TypeError, ValueError):
                # Non-numeric column: keep the original best-effort behaviour
                # (a targeted skip instead of a bare `except: pass`).
                continue
    if sort:
        return sorted(que, key=lambda t: abs(t["psr"]), reverse=True)
    return que

25
main.py
View File

@ -3,9 +3,30 @@ import file_load
import json import json
import os import os
import numpy
# file_load.make_data()
def main():
    """Correlate embedding cosine similarity with Pearson r across item pairs."""
    similarity = file_load.similarity()
    corelation = file_load.corelation()

    # Index Pearson r by (from, to) item pair for O(1) lookup per similarity row.
    table = {}
    for pair in corelation:
        table[pair["from"], pair["to"]] = pair["psr"]

    # Overall correlation between |pearson r| and cosine similarity.
    x = [abs(table[p["from"], p["to"]]) for p in similarity]
    y = [p["similarity"] for p in similarity]
    print(numpy.corrcoef(x, y)[0, 1])

    # Dump (similarity, pearson r) points for external plotting.
    # Fix: the old header "similarity, corelation" produced a column literally
    # named " corelation" when read back by pandas.
    lines = ["similarity,corelation"]
    for p in similarity:
        lines.append(str(p["similarity"]) + "," + str(table[p["from"], p["to"]]))
    with open("Temp/point.csv", "w") as point:
        point.write("\n".join(lines) + "\n")


if __name__ == "__main__":
    main()