Spaces:
Build error
Build error
| """ | |
| Sep 2024 | |
| Ryan Tietjen | |
| Contains helper functions to process user input for the demo | |
| """ | |
| import pandas as pd | |
def split_abstract(abstract):
    """
    Split an abstract into one lower-cased record per line.

    Args:
        abstract (str): Raw abstract text with newline-separated lines.
            Windows-style (carriage return + newline) endings are accepted.

    Returns:
        list[dict]: One dict per line, in input order, with the keys
            "target" (always 0), "text" (the lower-cased line),
            "line_number" (1-based position of the line) and
            "total_lines" (number of lines in the abstract).
    """
    # Split on "\n" only, so the number of lines is the same as a plain
    # split, then drop the carriage return that "\r\n" endings leave behind.
    lines = [line.rstrip("\r") for line in abstract.split("\n")]
    total_lines = len(lines)
    return [
        {
            "target": 0,
            "text": line.lower(),
            "line_number": number,
            "total_lines": total_lines,
        }
        for number, line in enumerate(lines, start=1)
    ]
def split_abstract_original(abstract):
    """
    Split an abstract into one record per line, keeping the original casing.

    Args:
        abstract (str): Raw abstract text with newline-separated lines.
            Windows-style (carriage return + newline) endings are accepted.

    Returns:
        list[dict]: One dict per line, in input order, with the keys
            "target" (always 0), "text" (the line with its casing unchanged),
            "line_number" (1-based position of the line) and
            "total_lines" (number of lines in the abstract).
    """
    # Split on "\n" only, so the number of lines is the same as a plain
    # split, then drop the carriage return that "\r\n" endings leave behind.
    lines = [line.rstrip("\r") for line in abstract.split("\n")]
    total_lines = len(lines)
    return [
        {
            "target": 0,
            "text": line,
            "line_number": number,
            "total_lines": total_lines,
        }
        for number, line in enumerate(lines, start=1)
    ]
def split_sentences_by_characters(corpus):
    """
    Join the items of every sentence in a corpus with single spaces.

    Args:
        corpus: Iterable of sentences. Each sentence is an iterable of
            strings; a plain string is therefore joined character by
            character.

    Returns:
        list[str]: One space-joined string per sentence, in input order.
    """
    spaced_sentences = []
    for sentence in corpus:
        spaced_sentences.append(" ".join(sentence))
    return spaced_sentences
def encode_labels(*datasets):
    """
    Encode labels for multiple datasets using a unified label mapping.

    Every distinct label found across all datasets is assigned an integer
    equal to its position among the sorted distinct labels, so the same
    label receives the same code in every dataset.

    Args:
        *datasets: Arbitrary number of array-like structures containing labels.

    Returns:
        tuple: Encoded labels as numpy arrays, one per dataset, in the order
            the datasets were passed. Empty when no datasets are given.
    """
    # pd.concat raises on an empty list, so short-circuit the no-input case.
    if not datasets:
        return ()

    # Wrap each dataset once and reuse the Series for both steps below.
    label_series = [pd.Series(data) for data in datasets]
    all_labels = pd.concat(label_series)

    # Sort through the Series API: it works for numpy-backed and
    # extension-array-backed labels alike, whereas the array returned by
    # pd.unique only offers an in-place .sort() when it is a numpy ndarray.
    unique_labels = all_labels.drop_duplicates().sort_values()

    # Create mapping from labels to integers
    label_to_index = {label: idx for idx, label in enumerate(unique_labels)}

    # Encode all datasets using the shared mapping
    return tuple(
        series.map(label_to_index).to_numpy() for series in label_series
    )