Update 2.py
2.py CHANGED
@@ -8,20 +8,6 @@ from tqdm import tqdm
 
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 
-
-# Set the cache directory path
-cache_dir = '/app/cache'
-
-# Create the directory if it doesn't exist
-if not os.path.exists(cache_dir):
-    os.makedirs(cache_dir)
-
-# Set the environment variable
-os.environ['HF_HOME'] = cache_dir
-
-# Verify the environment variable is set
-print(f"HF_HOME is set to: {os.environ['HF_HOME']}")
-
 class Config: E, H, N, C, B, M, S, V, W, L, D = 512, 32, 1024, 256, 128, 20000, 2048, 1e5, 4000, 2e-4, .15
 
 class MyDataset(Dataset):
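The removed block tried to point the Hugging Face cache at /app/cache from inside the script. One caveat worth knowing: huggingface_hub resolves HF_HOME once, at import time, so assigning os.environ['HF_HOME'] after transformers has already been imported generally has no effect on where models are cached. A minimal sketch of the ordering that does work (assuming the same /app/cache path; in a Docker-based Space, ENV HF_HOME=/app/cache in the Dockerfile is the simpler route):

import os

cache_dir = '/app/cache'
os.makedirs(cache_dir, exist_ok=True)  # create the cache dir idempotently
os.environ['HF_HOME'] = cache_dir      # must happen before the imports below

from transformers import AutoModel, AutoTokenizer  # reads HF_HOME on import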
@@ -90,7 +76,7 @@ def create_model_from_folder(folder_path):
             s[os.path.basename(r).replace('.', '_')].extend(parse_xml(os.path.join(r, file)))
     return DM(dict(s))
 
-def create_embeddings_and_sentences(folder_path, model_name="pile-of-law/legalbert-large-1.7M-1"):
+def create_embeddings_and_sentences(folder_path, model_name="pile-of-law/legalbert-large-1.7M-1", max_length=512):
     t, m, embeddings, ds = AutoTokenizer.from_pretrained(model_name), AutoModel.from_pretrained(model_name), [], []
     for r, d, f in os.walk(folder_path):
         for file in f:
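The new max_length=512 default is the substantive part of this commit. BERT-style encoders like legalbert-large have a fixed budget of 512 position embeddings; if the tokenizer's model_max_length is missing or set to a sentinel value, truncation=True alone will not clip long XML passages, and the forward pass fails with an index error on the position-embedding table, which is a plausible source of this Space's runtime error. A minimal sketch of the guarded call (t and text as in the diff):

i = t(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
assert i["input_ids"].shape[1] <= 512  # never exceeds the position-embedding table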
@@ -99,15 +85,15 @@ def create_embeddings_and_sentences(folder_path, model_name="pile-of-law/legalbe
             for e in root.iter():
                 if e.text:
                     text = e.text.strip()
-                    i = t(text, return_tensors="pt", truncation=True, padding=True)
+                    i = t(text, return_tensors="pt", truncation=True, padding=True, max_length=max_length)
                     with torch.no_grad():
                         embeddings.append(m(**i).last_hidden_state.mean(dim=1).numpy())
                     ds.append(text)
     return np.vstack(embeddings), ds
 
-def query_vector_similarity(query, embeddings, ds, model_name="pile-of-law/legalbert-large-1.7M-2"):
+def query_vector_similarity(query, embeddings, ds, model_name="pile-of-law/legalbert-large-1.7M-2", max_length=512):
     t, m = AutoTokenizer.from_pretrained(model_name), AutoModel.from_pretrained(model_name)
-    i = t(query, return_tensors="pt", truncation=True, padding=True)
+    i = t(query, return_tensors="pt", truncation=True, padding=True, max_length=max_length)
     with torch.no_grad():
         qe = m(**i).last_hidden_state.mean(dim=1).numpy()
     return [ds[i] for i in cosine_similarity(qe, embeddings)[0].argsort()[-5:][::-1]]
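Both functions pool by averaging last_hidden_state over the sequence dimension. With one text per call, padding=True adds no [PAD] tokens, so the plain mean is fine; if the loop were ever batched for speed, padded positions would be averaged in and skew the embeddings. A hedged sketch of a padding-aware mean for that hypothetical batched case (batch_texts is illustrative, not in the commit):

i = t(batch_texts, return_tensors="pt", truncation=True, padding=True, max_length=512)
with torch.no_grad():
    h = m(**i).last_hidden_state              # (batch, seq, hidden)
    mask = i["attention_mask"].unsqueeze(-1)  # (batch, seq, 1)
    emb = (h * mask).sum(dim=1) / mask.sum(dim=1)  # mean over real tokens only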
@@ -122,7 +108,7 @@ def fetch_courtlistener_data(query):
         return []
 
 def main():
-    folder_path, model = 'data', create_model_from_folder('
+    folder_path, model = 'data', create_model_from_folder('Xml_Data')
     logging.info(f"Created dynamic PyTorch model with sections: {list(model.s.keys())}")
     embeddings, ds = create_embeddings_and_sentences(folder_path)
     accelerator, optimizer, criterion, num_epochs = Accelerator(), torch.optim.Adam(model.parameters(), lr=0.001), nn.CrossEntropyLoss(), 10
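One thing the commit leaves untouched: create_embeddings_and_sentences loads pile-of-law/legalbert-large-1.7M-1 while query_vector_similarity loads pile-of-law/legalbert-large-1.7M-2. Unless those two checkpoints share weights, the query vector and the document vectors come from different embedding spaces and the cosine ranking is unreliable (the query path also reloads tokenizer and model on every call). A sketch of a shared, hypothetical helper, not part of this commit, that encodes both sides with one checkpoint:

def embed(texts, t, m, max_length=512):
    # t, m: one tokenizer/model pair reused for documents and queries
    out = []
    for text in texts:
        i = t(text, return_tensors="pt", truncation=True, padding=True, max_length=max_length)
        with torch.no_grad():
            out.append(m(**i).last_hidden_state.mean(dim=1).numpy())
    return np.vstack(out)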