计算向量相似度和皮尔森r的相关 (compute the correlation between embedding vector similarity and Pearson's r)
This commit is contained in:
parent
e4b0d3e583
commit
2a2e5f9d27
69
file_load.py
69
file_load.py
@ -2,30 +2,16 @@ import json
|
|||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
|
|
||||||
|
import pandas
|
||||||
|
import numpy
|
||||||
|
# import matplotlib
|
||||||
|
from sklearn.metrics.pairwise import cosine_similarity
|
||||||
|
|
||||||
from openai import OpenAI
|
from openai import OpenAI
|
||||||
# SECURITY(review): a live-looking API key and a raw-IP endpoint are hardcoded
# in source control.  Rotate this key and load both values from deployment
# configuration / environment variables instead of committing them.
os.environ["OPENAI_API_KEY"] = "sk-PRJ811XeKzEy20Ug3dA98a34Af8b40B5816dE15503D33599"
os.environ["OPENAI_BASE_URL"] = "http://154.9.28.247:3000/v1/"

# OpenAI() reads OPENAI_API_KEY / OPENAI_BASE_URL at construction time, so the
# client must be created only AFTER the environment is populated (constructing
# it first would silently ignore the custom base URL).
client = OpenAI()
|
||||||
|
|
||||||
|
|
||||||
from sklearn.metrics.pairwise import cosine_similarity
|
|
||||||
|
|
||||||
def calc_similarity(scale):
    """Embed every item of *scale* and return all pairwise cosine similarities.

    Args:
        scale: mapping of item name -> text to embed.

    Returns:
        List of ``{"from", "to", "similarity"}`` dicts covering every unordered
        pair of items, sorted from most to least similar.
    """
    names = []
    vectors = []
    # One embedding request per item.  Model choices tried so far:
    # nomic-embed-text, text-embedding-3-small.
    for name, text in scale.items():
        names.append(name)
        response = client.embeddings.create(
            input=text, model="text-embedding-3-small"
        )
        vectors.append(response.data[0].embedding)

    matrix = cosine_similarity(vectors)
    pairs = []
    # Walk the strict lower triangle (j < i) so each pair appears exactly once.
    for i, row in enumerate(matrix):
        for j in range(i):
            pairs.append({"from": names[j], "to": names[i], "similarity": row[j]})
    return sorted(pairs, key=lambda p: p["similarity"], reverse=True)
def batch():
|
def batch():
|
||||||
scales = os.listdir("Scales")
|
scales = os.listdir("Scales")
|
||||||
@ -48,17 +34,35 @@ def old_type(str):
|
|||||||
with open(str,"w") as file:
|
with open(str,"w") as file:
|
||||||
file.write(json.dumps(new))
|
file.write(json.dumps(new))
|
||||||
|
|
||||||
def calc_similarity(scale):
    """Embed each entry of *scale* and compute pairwise cosine similarities.

    Args:
        scale: mapping of item name -> text to embed.

    Returns:
        Unsorted list of ``{"from", "to", "similarity"}`` dicts, one per
        unordered pair of items.
    """
    item = []
    vec = []
    for key in scale:
        item.append(key)
        # One embedding call per item (models tried: nomic-embed-text,
        # text-embedding-3-small).
        embedding = client.embeddings.create(
            input=scale[key], model="text-embedding-3-small"
        ).data[0].embedding
        vec.append(embedding)
    simi = cosine_similarity(vec)
    # Strict lower triangle: every unordered pair exactly once.
    que = [
        {"from": item[j], "to": item[i], "similarity": simi[i][j]}
        for i in range(len(simi))
        for j in range(i)
    ]
    return que
||||||
|
def similarity(force: bool = False, sort: bool = True):
    """Return pairwise embedding similarities, cached in ``Temp/items.json``.

    Args:
        force: recompute embeddings even when a non-empty cache exists.
        sort: return pairs sorted by similarity, descending.

    Returns:
        List of ``{"from", "to", "similarity"}`` dicts.
    """
    cache = "Temp/items.json"
    # Recompute when forced, when the cache file is missing, or when it is
    # empty.  os.path.getsize raises FileNotFoundError on a missing file, so
    # existence must be checked first (otherwise the very first run crashes).
    if force or not os.path.exists(cache) or os.path.getsize(cache) == 0:
        que = calc_similarity(batch())
        with open(cache, "w") as items:
            items.write(json.dumps(que))
    else:
        with open(cache, "r") as items:
            que = json.load(items)
    if sort:
        return sorted(que, key=lambda t: t["similarity"], reverse=True)
    return que
def data():
|
def make_data():
|
||||||
s=""
|
s=""
|
||||||
item = batch()
|
item = batch()
|
||||||
for i in item:
|
for i in item:
|
||||||
@ -71,3 +75,22 @@ def data():
|
|||||||
s+='\n'
|
s+='\n'
|
||||||
with open("Temp/data.csv","w") as data:
|
with open("Temp/data.csv","w") as data:
|
||||||
data.write(s)
|
data.write(s)
|
||||||
|
|
||||||
|
def corelation(sort: bool = True):
    """Compute pairwise Pearson correlations between the columns of data.csv.

    NOTE(review): ``make_data`` appears to write "Temp/data.csv" while this
    reads "data.csv" — confirm which path is intended.

    Args:
        sort: return pairs sorted by ``|r|``, descending.

    Returns:
        List of ``{"from", "to", "psr"}`` dicts, one per ordered pair of
        distinct columns (both directions are emitted, matching the lookups
        done by callers).
    """
    data = pandas.read_csv("data.csv")
    que = []
    for i in data:
        for j in data:
            if i == j:
                continue
            try:
                # .corr raises on non-numeric columns; skip those pairs.
                # (Narrowed from a bare ``except:`` that also swallowed
                # KeyboardInterrupt/SystemExit.)
                que.append({"from": j, "to": i, "psr": data[i].corr(data[j])})
            except Exception:
                pass
    if sort:
        return sorted(que, key=lambda t: abs(t["psr"]), reverse=True)
    return que
25
main.py
25
main.py
# Correlate semantic similarity (embedding cosine similarity) with statistical
# correlation (Pearson r) across scale items.
import file_load
import json
import os

import numpy

# file_load.make_data()

similarity = file_load.similarity()
corelation = file_load.corelation()

# Index Pearson r by (from, to) pair for O(1) lookup; corelation emits both
# directions of every pair, so either orientation resolves.
table = {(pair["from"], pair["to"]): pair["psr"] for pair in corelation}

# For each similar pair, collect |r| alongside its embedding similarity.
x = [abs(table[pair["from"], pair["to"]]) for pair in similarity]
y = [pair["similarity"] for pair in similarity]

# How strongly do the two measures agree overall?
print(numpy.corrcoef(x, y)[0, 1])

# Dump (similarity, correlation) points for external plotting.
rows = ["similarity, corelation"]
for pair in similarity:
    rows.append(str(pair["similarity"]) + ',' + str(table[pair["from"], pair["to"]]))
with open("Temp/point.csv", "w") as point:
    point.write('\n'.join(rows) + '\n')
|
Loading…
Reference in New Issue
Block a user