diff --git a/file_load.py b/file_load.py
index 4d34680..ff4baba 100644
--- a/file_load.py
+++ b/file_load.py
@@ -2,29 +2,15 @@
 import json
 import os
 import random
+import pandas
+import numpy
+# import matplotlib
+from sklearn.metrics.pairwise import cosine_similarity
+
 from openai import OpenAI
-client = OpenAI()
-os.environ["OPENAI_API_KEY"]= "sk-PRJ811XeKzEy20Ug3dA98a34Af8b40B5816dE15503D33599"
+# OPENAI_API_KEY must be supplied via the real environment; never commit secrets.
-os.environ["OPENAI_BASE_URL"]= "http://154.9.28.247:3000/v1/"
+os.environ.setdefault("OPENAI_BASE_URL", "http://154.9.28.247:3000/v1/")
-
-
-
-from sklearn.metrics.pairwise import cosine_similarity
-
-def calc_similarity(scale):
-    item=[]
-    vec=[]
-    for i in scale:
-        item.append(i)
-        vec.append(client.embeddings.create(
-            input=scale[i], model="text-embedding-3-small" # nomic-embed-text text-embedding-3-small
-        ).data[0].embedding)
-    simi=cosine_similarity(vec)
-    que=[]
-    for i,v in enumerate(simi):
-        for j in range(0,i):
-            que.append({"from":item[j], "to":item[i], "similarity":simi[i][j]})
-    return sorted(que, key = lambda t : t["similarity"], reverse=True)
+client = OpenAI()
 
 def batch():
     scales = os.listdir("Scales")
@@ -48,17 +34,35 @@ def old_type(str):
     with open(str,"w") as file:
         file.write(json.dumps(new))
 
-def calc_similarity(force:bool = False):
+def calc_similarity(scale):
+    # Embed every item, then compute the pairwise cosine-similarity matrix.
+    item=[]
+    vec=[]
+    for i in scale:
+        item.append(i)
+        vec.append(client.embeddings.create(
+            input=scale[i], model="text-embedding-3-small" # nomic-embed-text text-embedding-3-small
+        ).data[0].embedding)
+    simi=cosine_similarity(vec)
+    que=[]
+    for i,v in enumerate(simi):
+        for j in range(0,i):
+            que.append({"from":item[j], "to":item[i], "similarity":simi[i][j]})
+    return que
+
+def similarity(force:bool = False,sort:bool=True):
-    if force or os.path.getsize("Temp/items.json") == 0:
+    # Guard with exists(): getsize() raises FileNotFoundError on a cold cache.
+    if force or not os.path.exists("Temp/items.json") or os.path.getsize("Temp/items.json") == 0:
-        que=embedding(batch())
+        que=calc_similarity(batch())
         with open("Temp/items.json","w") as items:
             items.write(json.dumps(que))
     else:
         with open("Temp/items.json","r") as items:
             que = json.load(items)
-    return que
+    if sort:
+        return sorted(que, key = lambda t : t["similarity"], reverse=True)
+    else:
+        return que
 
-def data():
+def make_data():
     s=""
     item = batch()
     for i in item:
@@ -71,3 +75,22 @@ def data():
         s+='\n'
     with open("Temp/data.csv","w") as data:
         data.write(s)
+
+def corelation(sort:bool=True):
+    data = pandas.read_csv("Temp/data.csv")  # must match the path make_data() writes
+    que=[]
+    for i in data:
+        for j in data:
+            try:
+                if(i != j):
+                    # que[i,j]["psr"]=data[i].corr(data[j])
+                    que.append({"from":j,"to":i,"psr":data[i].corr(data[j])})
+                else:
+                    pass
+            except (TypeError, ValueError):  # non-numeric column: skip this pair only
+                pass
+    if sort:
+        return sorted(que,key = lambda t : abs(t["psr"]), reverse=True)
+    else:
+        return que
+
diff --git a/main.py b/main.py
index a3f70ab..685f249 100644
--- a/main.py
+++ b/main.py
@@ -3,9 +3,30 @@
 import file_load
 import json
 import os
 
+import numpy
+
+# file_load.make_data()
 similarity = file_load.similarity()
-file_load.data()
+corelation = file_load.corelation()
+
+table = {}
+
+for i in corelation:
+    table[i["from"],i["to"]]=i["psr"]
+
+x=[]
+y=[]
 
 for i in similarity:
-    print(i)
+    x.append(abs(table[i["from"],i["to"]]))
+    y.append(i["similarity"])
+
+print(numpy.corrcoef(x,y)[0,1])
+
+s="similarity, corelation\n"
+for i in similarity:
+    s+=str(i["similarity"])+','+str(table[i["from"],i["to"]])+'\n'
+with open("Temp/point.csv","w") as point:
+    point.write(s)
+