Spaces:
Sleeping
Sleeping
| import os | |
| import re | |
| import json | |
| from groq import Groq | |
def process_and_save_json(input_file_path, output_file_path, api_key,
                          chunk_size=2048, overlap_size=256,
                          model="llama3-8b-8192", about='Events',
                          details=None):
    """
    Process the input file in overlapping chunks, extract JSON objects from
    the model's responses, and save the combined result to a JSON file.

    Parameters:
        input_file_path (str): Path to the input file containing data.
        output_file_path (str): Path to the output JSON file.
        api_key (str): Groq API key for authentication.
        chunk_size (int): Size (in characters) of each chunk of data.
        overlap_size (int): Number of characters to overlap between chunks.
            Must be smaller than chunk_size or chunking could never advance.
        model (str): Model identifier to use for processing.
        about (str): Description of the data, interpolated into the prompt.
        details (list or str or None): Column names expected in the output
            JSON, or a single condition string. Defaults to
            ['name', 'details'] when None.

    Raises:
        ValueError: If overlap_size >= chunk_size (would loop forever).
    """
    # Avoid the mutable-default-argument pitfall: build the default per call.
    if details is None:
        details = ['name', 'details']
    # A bare string would otherwise be joined character-by-character in the
    # system prompt below.
    if isinstance(details, str):
        details = [details]
    if overlap_size >= chunk_size:
        raise ValueError("overlap_size must be smaller than chunk_size")

    # Initialize the Groq client with the provided API key.
    client = Groq(api_key=api_key)

    def read_file_in_chunks(file_path, chunk_size, overlap_size):
        """Yield overlapping chunks of the file's text."""
        with open(file_path, 'r', encoding='utf-8') as f:
            buffer = f.read()
        start = 0
        while start < len(buffer):
            end = start + chunk_size
            yield buffer[start:end]
            if end >= len(buffer):
                # The chunk just yielded already covered the tail; stepping
                # back by overlap_size here would emit a redundant chunk.
                break
            start = end - overlap_size  # step forward, keeping the overlap

    def extract_text_between_braces(text):
        """Return all non-greedy `{...}` spans found in *text* (DOTALL)."""
        return re.findall(r'\{.*?\}', text, re.DOTALL)

    def ensure_strings_in_json(data):
        """Recursively coerce every scalar value in *data* to str."""
        if isinstance(data, dict):
            return {k: str(v) if not isinstance(v, (dict, list)) else ensure_strings_in_json(v)
                    for k, v in data.items()}
        elif isinstance(data, list):
            return [ensure_strings_in_json(item) for item in data]
        return str(data)

    def process_chunk(client, chunk, model, about, details):
        """Send one chunk to the model and return the raw completion text."""
        system_message = (
            f"You are a helpful assistant for cleaning and organizing the data. \n"
            f"This data is about {about}. \n"
            f"Output should be well organized in the form of JSON with the following columns: {', '.join(details)}.\n"
            f"Do not add extra details apart from JSON.\n"
            f"If there is no such data, return an empty list.\n"
        )
        completion = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": chunk},
            ],
            temperature=1,
            max_tokens=8192,
            top_p=1,
            stream=False,
            stop=None,
        )
        # Accessing the message content using dot notation.
        return completion.choices[0].message.content

    combined_output = []
    # Read and process the file in overlapping chunks.
    for chunk in read_file_in_chunks(input_file_path, chunk_size, overlap_size):
        output = process_chunk(client, chunk, model, about, details)
        # Extract every brace-delimited span; the model may return prose
        # around (or between) the JSON objects.
        for brace_text in extract_text_between_braces(output):
            try:
                # Parse the candidate JSON and normalize all scalars to str.
                json_data = json.loads(brace_text)
                combined_output.append(ensure_strings_in_json(json_data))
            except json.JSONDecodeError:
                # Best-effort: log and skip spans that are not valid JSON.
                print("+++++++++++++++++++++++++++++++++++++++++++++++++")
                print("Invalid JSON format in extracted text:")
                print(brace_text)
                print("+++++++++++++++++++++++++++++++++++++++++++++++++")

    # Write the combined result to the output JSON file.
    with open(output_file_path, 'w', encoding='utf-8') as f:
        json.dump(combined_output, f, indent=4)
    print(f"Processing complete. Output saved to '{output_file_path}'.")