Spaces:
Sleeping
Sleeping
| import os | |
| import re | |
| import json | |
| from groq import Groq | |
def process_and_save_json(input_file_path, output_file_path, api_key,
                          chunk_size=2048, overlap_size=256,
                          model="llama3-8b-8192", about='Events',
                          details=None):
    """
    Process the input file in overlapping chunks, extract JSON objects from
    the model's responses, and save the combined result to a JSON file.

    Parameters:
        input_file_path (str): Path to the input file containing data.
        output_file_path (str): Path to the output JSON file.
        api_key (str): Groq API key for authentication.
        chunk_size (int): Size (in characters) of each chunk of data.
        overlap_size (int): Number of characters to overlap between chunks.
            Must be smaller than chunk_size or chunking could never advance.
        model (str): Model identifier to use for processing.
        about (str): Description of the data, interpolated into the prompt.
        details (list or str or None): Column names expected in the output
            JSON, or a single condition string. Defaults to
            ['name', 'details'] when None.

    Raises:
        ValueError: If overlap_size >= chunk_size (would loop forever).
    """
    # Avoid the mutable-default-argument pitfall: build the default per call.
    if details is None:
        details = ['name', 'details']
    # A bare string would otherwise be joined character-by-character in the
    # system prompt below.
    if isinstance(details, str):
        details = [details]
    if overlap_size >= chunk_size:
        raise ValueError("overlap_size must be smaller than chunk_size")

    # Initialize the Groq client with the provided API key.
    client = Groq(api_key=api_key)

    def read_file_in_chunks(file_path, chunk_size, overlap_size):
        """Yield overlapping chunks of the file's text."""
        with open(file_path, 'r', encoding='utf-8') as f:
            buffer = f.read()
        start = 0
        while start < len(buffer):
            end = start + chunk_size
            yield buffer[start:end]
            if end >= len(buffer):
                # The chunk just yielded already covered the tail; stepping
                # back by overlap_size here would emit a redundant chunk.
                break
            start = end - overlap_size  # step forward, keeping the overlap

    def extract_text_between_braces(text):
        """Return all non-greedy `{...}` spans found in *text* (DOTALL)."""
        return re.findall(r'\{.*?\}', text, re.DOTALL)

    def ensure_strings_in_json(data):
        """Recursively coerce every scalar value in *data* to str."""
        if isinstance(data, dict):
            return {k: str(v) if not isinstance(v, (dict, list)) else ensure_strings_in_json(v)
                    for k, v in data.items()}
        elif isinstance(data, list):
            return [ensure_strings_in_json(item) for item in data]
        return str(data)

    def process_chunk(client, chunk, model, about, details):
        """Send one chunk to the model and return the raw completion text."""
        system_message = (
            f"You are a helpful assistant for cleaning and organizing the data. \n"
            f"This data is about {about}. \n"
            f"Output should be well organized in the form of JSON with the following columns: {', '.join(details)}.\n"
            f"Do not add extra details apart from JSON.\n"
            f"If there is no such data, return an empty list.\n"
        )
        completion = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": chunk},
            ],
            temperature=1,
            max_tokens=8192,
            top_p=1,
            stream=False,
            stop=None,
        )
        # Accessing the message content using dot notation.
        return completion.choices[0].message.content

    combined_output = []
    # Read and process the file in overlapping chunks.
    for chunk in read_file_in_chunks(input_file_path, chunk_size, overlap_size):
        output = process_chunk(client, chunk, model, about, details)
        # Extract every brace-delimited span; the model may return prose
        # around (or between) the JSON objects.
        for brace_text in extract_text_between_braces(output):
            try:
                # Parse the candidate JSON and normalize all scalars to str.
                json_data = json.loads(brace_text)
                combined_output.append(ensure_strings_in_json(json_data))
            except json.JSONDecodeError:
                # Best-effort: log and skip spans that are not valid JSON.
                print("+++++++++++++++++++++++++++++++++++++++++++++++++")
                print("Invalid JSON format in extracted text:")
                print(brace_text)
                print("+++++++++++++++++++++++++++++++++++++++++++++++++")

    # Write the combined result to the output JSON file.
    with open(output_file_path, 'w', encoding='utf-8') as f:
        json.dump(combined_output, f, indent=4)
    print(f"Processing complete. Output saved to '{output_file_path}'.")