Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import torch | |
| from transformers import T5ForConditionalGeneration, T5Tokenizer, AutoTokenizer, AutoModelForSeq2SeqLM | |
| from bs4 import BeautifulSoup, NavigableString, Tag | |
| import re | |
| import time | |
| import random | |
| import nltk | |
| from nltk.tokenize import sent_tokenize | |
# Download required NLTK data. This is best effort: a failed download must not
# stop the module from importing. Newer NLTK releases load the sentence
# tokenizer from 'punkt_tab' rather than 'punkt', so both are requested.
for _resource in ('punkt', 'punkt_tab'):
    try:
        nltk.download(_resource, quiet=True)
    except Exception as err:  # not a bare except: Ctrl-C / SystemExit must propagate
        print(f"Could not download NLTK resource '{_resource}': {err}")

# Try to import spaCy but make it optional
try:
    import spacy
    SPACY_AVAILABLE = True
except Exception:  # ImportError, or any error raised while spaCy initialises
    print("spaCy not available, using NLTK for sentence processing")
    SPACY_AVAILABLE = False
class HumanLikeVariations:
    """Add human-like variations and intentional imperfections to text.

    The instance only holds phrase tables; every transformation method takes
    a string and returns a new string. Most transformations are randomised
    through the module-level ``random`` generator.
    """

    # Full phrase -> contraction, applied in insertion order (order matters:
    # e.g. "will not" is handled before "we will").
    _CONTRACTIONS = {
        "it is": "it's", "that is": "that's", "there is": "there's",
        "he is": "he's", "she is": "she's", "what is": "what's",
        "where is": "where's", "who is": "who's", "how is": "how's",
        "cannot": "can't", "will not": "won't", "do not": "don't",
        "does not": "doesn't", "did not": "didn't", "could not": "couldn't",
        "should not": "shouldn't", "would not": "wouldn't", "is not": "isn't",
        "are not": "aren't", "was not": "wasn't", "were not": "weren't",
        "have not": "haven't", "has not": "hasn't", "had not": "hadn't",
        "I am": "I'm", "you are": "you're", "we are": "we're",
        "they are": "they're", "I have": "I've", "you have": "you've",
        "we have": "we've", "they have": "they've", "I will": "I'll",
        "you will": "you'll", "he will": "he'll", "she will": "she'll",
        "we will": "we'll", "they will": "they'll", "I would": "I'd",
        "you would": "you'd", "he would": "he'd", "she would": "she'd",
        "we would": "we'd", "they would": "they'd", "could have": "could've",
        "should have": "should've", "would have": "would've", "might have": "might've",
        "must have": "must've", "there has": "there's", "here is": "here's",
        "let us": "let's", "that will": "that'll", "who will": "who'll"
    }

    # Words for which a deliberate typo is considered harmless.
    _SAFE_TYPOS = {
        'the': 'teh',
        'and': 'adn',
        'that': 'taht',
        'with': 'wtih',
        'from': 'form',
        'because': 'becuase'
    }

    # Lazily loaded spaCy pipeline, shared by all instances so the model is
    # loaded at most once instead of on every sentence-splitting call.
    _nlp = None
    _nlp_load_attempted = False

    def __init__(self):
        # Common human writing patterns - EXPANDED for Originality AI
        self.casual_transitions = [
            "So, ", "Well, ", "Now, ", "Actually, ", "Basically, ",
            "You know, ", "I mean, ", "Thing is, ", "Honestly, ",
            "Look, ", "Listen, ", "See, ", "Okay, ", "Right, ",
            "Anyway, ", "Besides, ", "Plus, ", "Also, ", "Oh, ",
            "Hey, ", "Alright, ", "Sure, ", "Fine, ", "Obviously, ",
            "Clearly, ", "Seriously, ", "Literally, ", "Frankly, ",
            "To be honest, ", "Truth is, ", "In fact, ", "Believe it or not, ",
            "Here's the thing, ", "Let me tell you, ", "Get this, ",
            "Funny thing is, ", "Interestingly, ", "Surprisingly, ",
            "Let's be real here, ", "Can we talk about ", "Quick question: ",
            "Real talk: ", "Hot take: ", "Unpopular opinion: ", "Fun fact: ",
            "Pro tip: ", "Side note: ", "Random thought: ", "Food for thought: ",
            "Just saying, ", "Not gonna lie, ", "For what it's worth, ",
            "If you ask me, ", "Between you and me, ", "Here's my take: ",
            "Let's face it, ", "No kidding, ", "Seriously though, ",
            "But wait, ", "Hold on, ", "Check this out: ", "Guess what? "
        ]
        self.filler_phrases = [
            "kind of", "sort of", "pretty much", "basically", "actually",
            "really", "just", "quite", "rather", "fairly", "totally",
            "definitely", "probably", "maybe", "perhaps", "somehow",
            "somewhat", "literally", "seriously", "honestly", "frankly",
            "simply", "merely", "purely", "truly", "genuinely",
            "absolutely", "completely", "entirely", "utterly", "practically",
            "virtually", "essentially", "fundamentally", "generally", "typically",
            "usually", "normally", "often", "sometimes", "occasionally",
            "apparently", "evidently", "obviously", "clearly", "seemingly",
            "arguably", "potentially", "possibly", "likely", "unlikely",
            "more or less", "give or take", "so to speak", "if you will",
            "per se", "as such", "in a way", "to some extent", "to a degree",
            "I kid you not", "no joke", "for real", "not gonna lie",
            "I'm telling you", "trust me", "believe me", "I swear",
            "hands down", "without a doubt", "100%", "straight up",
            "I think", "I feel like", "I guess", "I suppose", "seems like",
            "appears to be", "might be", "could be", "tends to", "tends to be",
            "in my experience", "from what I've seen", "as far as I know",
            "to the best of my knowledge", "if I'm not mistaken", "correct me if I'm wrong",
            "you know what", "here's the deal", "bottom line", "at any rate",
            "all in all", "when you think about it", "come to think of it",
            "now that I think about it", "if we're being honest", "to be fair"
        ]
        self.human_connectors = [
            ", which means", ", so", ", because", ", since", ", although",
            ". That's why", ". This means", ". So basically,", ". The thing is,",
            ", and honestly", ", but here's the thing", ", though", ", however",
            ". Plus,", ". Also,", ". Besides,", ". Moreover,", ". Furthermore,",
            ", which is why", ", and that's because", ", given that", ", considering",
            ". In other words,", ". Put simply,", ". To clarify,", ". That said,",
            ", you see", ", you know", ", right?", ", okay?", ", yeah?",
            ". Here's why:", ". Let me explain:", ". Think about it:",
            ", if you ask me", ", in my opinion", ", from my perspective",
            ". On the flip side,", ". On the other hand,", ". Conversely,",
            ", not to mention", ", let alone", ", much less", ", aside from",
            ". What's more,", ". Even better,", ". Even worse,", ". The catch is,",
            ", believe it or not", ", surprisingly enough", ", interestingly enough",
            ". Long story short,", ". Bottom line is,", ". Point being,",
            ", as you might expect", ", as it turns out", ", as luck would have it",
            ". And get this:", ". But wait, there's more:", ". Here's the kicker:",
            ", and here's why", ", and here's the thing", ", but here's what happened",
            ". Spoiler alert:", ". Plot twist:", ". Reality check:",
            ", at the end of the day", ", when all is said and done", ", all things considered",
            ". Make no mistake,", ". Don't get me wrong,", ". Let's not forget,",
            ", between you and me", ", off the record", ", just between us",
            ". And honestly?", ". But seriously,", ". And you know what?",
            ", which brings me to", ". This reminds me of", ", speaking of which",
            ". Funny enough,", ". Weird thing is,", ". Strange but true:",
            ", and I mean", ". I'm not kidding when I say", ", and trust me on this"
        ]
        # NEW: Common human typos and variations
        self.common_typos = {
            "the": ["teh", "th", "hte"],
            "and": ["adn", "nad", "an"],
            "that": ["taht", "htat", "tha"],
            "with": ["wiht", "wtih", "iwth"],
            "have": ["ahve", "hvae", "hav"],
            "from": ["form", "fro", "frmo"],
            "they": ["tehy", "thye", "htey"],
            "which": ["whihc", "wich", "whcih"],
            "their": ["thier", "theri", "tehir"],
            "would": ["woudl", "wuold", "woul"],
            "there": ["tehre", "theer", "ther"],
            "could": ["coudl", "cuold", "coud"],
            "people": ["poeple", "peopel", "pepole"],
            "through": ["thorugh", "throught", "trhough"],
            "because": ["becuase", "becasue", "beacuse"],
            "before": ["beofre", "befroe", "befor"],
            "different": ["differnt", "differnet", "diferent"],
            "between": ["bewteen", "betwen", "betewen"],
            "important": ["improtant", "importnat", "importan"],
            "information": ["infromation", "informaiton", "informaton"]
        }
        # NEW: Human-like sentence starters for variety
        self.varied_starters = [
            "When it comes to", "As for", "Regarding", "In terms of",
            "With respect to", "Concerning", "Speaking of", "About",
            "If we look at", "Looking at", "Considering", "Given",
            "Taking into account", "Bear in mind that", "Keep in mind",
            "It's worth noting that", "It should be noted that",
            "One thing to consider is", "An important point is",
            "What's interesting is", "What stands out is",
            "The key here is", "The main thing is", "The point is",
            "Here's what matters:", "Here's the deal:", "Here's something:",
            "Let's not forget", "We should remember", "Don't forget",
            "Think about it this way:", "Look at it like this:",
            "Consider this:", "Picture this:", "Imagine this:",
            "You might wonder", "You might ask", "You may think",
            "Some people say", "Many believe", "It's often said",
            "Research shows", "Studies indicate", "Evidence suggests",
            "Experience tells us", "History shows", "Time has shown"
        ]

    @staticmethod
    def _lower_first(fragment):
        """Lower-case the first letter of *fragment* so it can follow an opener.

        The fragment is returned unchanged when its first word is the pronoun
        "I" (including "I'm", "I've", ...) or an all-caps word, because
        lower-casing those would introduce an error ("Look, i think ...").
        """
        parts = fragment.split(None, 1)
        if not parts:
            return fragment
        first_word = parts[0].rstrip('.,;:!?')
        if first_word == 'I' or first_word.startswith(("I'", "I\u2019")):
            return fragment
        if len(first_word) > 1 and first_word.isupper():
            return fragment
        return fragment[0].lower() + fragment[1:]

    @staticmethod
    def _case_preserving(contraction):
        """Build a ``re.sub`` callback that keeps a capitalised match capitalised."""
        def _replace(match):
            if match.group(0)[0].isupper():
                return contraction[0].upper() + contraction[1:]
            return contraction
        return _replace

    @classmethod
    def _get_spacy_pipeline(cls):
        """Return the cached spaCy pipeline, or None when spaCy cannot be used."""
        if not SPACY_AVAILABLE:
            return None
        if not cls._nlp_load_attempted:
            # Remember the attempt even on failure so a missing model is not
            # retried (and does not pay the load cost) on every call.
            cls._nlp_load_attempted = True
            try:
                cls._nlp = spacy.load("en_core_web_sm")
            except Exception:
                cls._nlp = None
        return cls._nlp

    def add_human_touch(self, text):
        """Add subtle human-like imperfections - NATURAL PATTERNS ONLY.

        The text is split on '. ', contractions are applied to every
        non-empty piece, and long pieces (more than 15 words) have a 5%
        chance of receiving one small error (dropped comma before "and",
        their/there swap, or "it's" -> "its"). Pieces are re-joined with '. '.
        """
        modified_sentences = []
        for sent in text.split('. '):
            if not sent.strip():
                continue
            # Always use contractions where natural
            sent = self.apply_contractions(sent)
            # Add VERY occasional natural errors (5% chance)
            if random.random() < 0.05 and len(sent.split()) > 15:
                error_types = [
                    # Missing comma in compound sentence
                    lambda s: s.replace(", and", " and", 1) if ", and" in s else s,
                    # Wrong homophone
                    lambda s: s.replace("their", "there", 1) if "their" in s and random.random() < 0.3 else s,
                    # Missing apostrophe
                    lambda s: s.replace("it's", "its", 1) if "it's" in s and random.random() < 0.3 else s,
                ]
                sent = random.choice(error_types)(sent)
            modified_sentences.append(sent)
        return '. '.join(modified_sentences)

    def apply_contractions(self, text):
        """Apply common contractions - EXPANDED.

        Each contraction is applied with an 80% chance, matching whole
        phrases case-insensitively. A match that starts with a capital letter
        yields a capitalised contraction, so "It is" becomes "It's" rather
        than "it's".
        """
        for full, contr in self._CONTRACTIONS.items():
            if random.random() < 0.8:  # 80% chance to apply each contraction
                text = re.sub(r'\b' + full + r'\b', self._case_preserving(contr),
                              text, flags=re.IGNORECASE)
        return text

    def add_minor_errors(self, text):
        """Add very minor, human-like errors - MORE REALISTIC BUT CONTROLLED.

        Independently and at random: drops Oxford commas (15%), swaps the
        first eligible "that" for "which" (8%), introduces a typo in long
        sentences (2% per sentence, then 50% if the chosen word is in the
        safe list), and turns one "its"/"your" followed by an -ing/-ed word
        into "it's"/"you're" (2%, then 30% per pair).
        """
        # Occasionally miss Oxford comma (15% chance)
        if random.random() < 0.15:
            # Only in lists, not random commas
            text = re.sub(r'(\w+), (\w+), and (\w+)', r'\1, \2 and \3', text)
        # Sometimes use 'which' instead of 'that' (8% chance)
        if random.random() < 0.08:
            match = re.search(r'\b(\w+) that (\w+)', text)  # only the first occurrence
            if match and match.group(1).lower() not in ['believe', 'think', 'know', 'say']:
                text = text.replace(match.group(0), f"{match.group(1)} which {match.group(2)}", 1)
        # Very occasional typos (2% chance per sentence), only in longer sentences
        sentences = text.split('. ')
        for i, sent in enumerate(sentences):
            if random.random() < 0.02 and len(sent.split()) > 15:
                words = sent.split()
                # Pick a word from the second half, avoiding the very last word
                word_idx = random.randint(len(words)//2, len(words)-2)
                word = words[word_idx].lower()
                # Only typo common words where typo won't break meaning
                if word in self._SAFE_TYPOS and random.random() < 0.5:
                    typo = self._SAFE_TYPOS[word]
                    # Preserve original capitalization
                    if words[word_idx][0].isupper():
                        typo = typo[0].upper() + typo[1:]
                    words[word_idx] = typo
                    sentences[i] = ' '.join(words)
        text = '. '.join(sentences)
        # Skip double words - too distracting
        # Mix up common homophones occasionally (2% chance) - ONLY SAFE ONES
        if random.random() < 0.02:
            safe_homophones = [
                ('its', "it's"),  # Very common mistake
                ('your', "you're"),  # Another common one
            ]
            for pair in safe_homophones:
                # Check context to avoid breaking meaning
                if f" {pair[0]} " in text and random.random() < 0.3:
                    # its/your followed by an -ing/-ed word
                    pattern = rf'\b{pair[0]}\s+(\w+ing|\w+ed)\b'
                    if re.search(pattern, text):
                        text = re.sub(pattern, f"{pair[1]} \\1", text, count=1)
                        break
        return text

    def add_natural_human_patterns(self, text):
        """Add natural human writing patterns that Originality AI associates with human text.

        Per sentence: apply contractions; with 15% chance (sentences over 10
        words) insert "you know,"/"I mean," or prepend an opener; with 10%
        chance add a small error (dropped comma or doubled short word); with
        20% chance merge with a short following sentence using "but"/"and".
        Returns the processed sentences joined by single spaces.
        """
        sentences = self.split_into_sentences_advanced(text)
        result_sentences = []
        for i, sentence in enumerate(sentences):
            if not sentence.strip():
                # Empty, or already merged into the previous sentence
                continue
            # Natural contractions throughout
            sentence = self.apply_contractions(sentence)
            # Add natural speech patterns (15% chance)
            if random.random() < 0.15 and len(sentence.split()) > 10:
                if random.random() < 0.5:
                    # Add "you know" or "I mean" mid-sentence
                    words = sentence.split()
                    if len(words) > 6:
                        pos = random.randint(3, len(words)-3)
                        if random.random() < 0.5:
                            words.insert(pos, "you know,")
                        else:
                            words.insert(pos, "I mean,")
                        sentence = ' '.join(words)
                else:
                    # Start with natural opener
                    openers = ["Look,", "See,", "Thing is,", "Honestly,", "Actually,"]
                    sentence = random.choice(openers) + " " + self._lower_first(sentence)
            # Add subtle errors that humans make (10% chance - reduced)
            if random.random() < 0.10:
                words = sentence.split()
                if len(words) > 5:
                    # Common comma omissions
                    if ", and" in sentence and random.random() < 0.3:
                        sentence = sentence.replace(", and", " and", 1)
                    # Double words occasionally
                    elif random.random() < 0.2:
                        idx = random.randint(1, len(words)-2)
                        if words[idx].lower() in ['the', 'a', 'to', 'in', 'on', 'at']:
                            words.insert(idx+1, words[idx])
                            sentence = ' '.join(words)
            # Natural sentence combinations (20% chance)
            if i < len(sentences) - 1 and random.random() < 0.2:
                next_sent = sentences[i+1].strip()
                if next_sent and len(sentence.split()) + len(next_sent.split()) < 25:
                    lowered_next = next_sent.lower()
                    # Natural connectors based on content
                    if any(w in lowered_next for w in ['but', 'however', 'although']):
                        sentence = sentence.rstrip('.') + ", but " + self._lower_first(next_sent)
                        sentences[i+1] = ""  # Mark as processed
                    elif any(w in lowered_next for w in ['also', 'too', 'as well']):
                        sentence = sentence.rstrip('.') + " and " + self._lower_first(next_sent)
                        sentences[i+1] = ""  # Mark as processed
            result_sentences.append(sentence)
        return ' '.join([s for s in result_sentences if s])

    def vary_sentence_start(self, sentence):
        """Vary sentence beginning to avoid repetitive patterns.

        Sentences that are empty or shorter than five words are returned
        unchanged. Otherwise one of six templates (including "keep the
        original") is chosen uniformly at random. Templates that append their
        own ending are built from the sentence without its closing
        punctuation, so the result never contains "., " or ". is clear.".
        """
        if not sentence:
            return sentence
        words = sentence.split()
        if len(words) < 5:
            return sentence
        clause = self._lower_first(' '.join(words).rstrip('.!?'))
        lowered = self._lower_first(sentence)
        candidates = [
            "When " + clause + ", it makes sense.",
            "If you think about it, " + lowered,
            sentence + " This is important.",
            "The thing about " + clause + " is clear.",
            "What's interesting is " + lowered,
            sentence,  # Keep original sometimes
        ]
        return random.choice(candidates)

    def split_into_sentences_advanced(self, text):
        """Advanced sentence splitting using spaCy or NLTK.

        Tries the cached spaCy pipeline first, then NLTK's ``sent_tokenize``,
        then a regex split on whitespace that follows '.', '!' or '?'. Each
        stage is used only if the previous one is unavailable or raises.
        Empty and whitespace-only sentences are dropped.
        """
        sentences = None
        nlp = self._get_spacy_pipeline()
        if nlp is not None:
            try:
                sentences = [sent.text.strip() for sent in nlp(text).sents]
            except Exception:
                sentences = None
        if sentences is None:
            try:
                sentences = sent_tokenize(text)
            except Exception:
                # Final fallback to regex (e.g. tokenizer data not installed)
                sentences = re.split(r'(?<=[.!?])\s+', text)
        # Clean up sentences
        return [s for s in sentences if s and s.strip()]
class SelectiveGrammarFixer:
    """Minimal grammar fixes to maintain human-like quality while fixing critical errors."""

    def __init__(self):
        self.nlp = None
        self.human_variations = HumanLikeVariations()

    def fix_incomplete_sentences_only(self, text):
        """Fix only incomplete sentences without over-correcting.

        The text is split on '. '. Every non-empty piece that lacks closing
        punctuation gets a period (the last piece is left alone when it ends
        with ':' or ','), and a lower-case first letter is capitalised. The
        pieces are re-joined with single spaces. Falsy input is returned
        unchanged.
        """
        if not text:
            return text
        sentences = text.split('. ')
        last_index = len(sentences) - 1
        fixed_sentences = []
        for i, sent in enumerate(sentences):
            sent = sent.strip()
            if not sent:
                continue
            # Only fix if sentence is incomplete
            if sent[-1] not in '.!?':
                # Middle sentences always get a period; the last one only
                # when it does not look deliberately open-ended.
                if i != last_index or not sent.endswith((':', ',')):
                    sent += '.'
            if sent[0].islower():
                follows_sentence_end = bool(fixed_sentences) and \
                    fixed_sentences[-1].rstrip().endswith(('.', '!', '?'))
                if i == 0 or follows_sentence_end:
                    sent = sent[0].upper() + sent[1:]
            fixed_sentences.append(sent)
        return ' '.join(fixed_sentences)

    def fix_basic_punctuation_errors(self, text):
        """Fix only the most egregious punctuation errors.

        Collapses runs of whitespace, removes whitespace before punctuation,
        inserts a missing space after '.', ',', '!' or '?' when a capital
        letter follows, collapses repeated '.', '!' or '?', and capitalises a
        stand-alone "i". Dotted acronyms ("U.S.A") and the abbreviation
        "i.e." are left intact.
        """
        if not text:
            return text
        # Fix double spaces (human-like error)
        text = re.sub(r'\s{2,}', ' ', text)
        # Fix space before punctuation (common error)
        text = re.sub(r'\s+([.,!?;:])', r'\1', text)
        # Fix missing space after punctuation. A period that directly follows
        # a single capital letter is treated as part of an acronym and skipped.
        text = re.sub(r'((?<!\b[A-Z])\.|[,!?])([A-Z])', r'\1 \2', text)
        # Fix accidental double punctuation
        # NOTE(review): this also collapses an ellipsis ("...") to one period.
        text = re.sub(r'([.!?])\1+', r'\1', text)
        # Fix "i" capitalization, but not the "i" of abbreviations like "i.e."
        text = re.sub(r'\bi\b(?!\.[a-z])', 'I', text)
        return text

    def preserve_natural_variations(self, text):
        """Keep some natural human-like variations.

        Only text that is really broken is touched: more than 20 words and no
        period at all. Such text is cut into sentences of 12-25 words; the
        length of each sentence is drawn once, when the previous one ends. A
        break is postponed past words that already end in punctuation, and
        the word after an inserted period is capitalised.
        """
        # Don't fix everything - leave some variety
        if text.count('.') != 0 or len(text.split()) <= 20:
            return text
        rebuilt = []
        words_in_sentence = 0
        target_length = random.randint(12, 25)
        capitalize_next = False
        for word in text.split():
            if capitalize_next:
                if word[0].islower():
                    word = word[0].upper() + word[1:]
                capitalize_next = False
            words_in_sentence += 1
            if word[-1] in '!?':
                # The text already ends a sentence here; start counting afresh.
                words_in_sentence = 0
                target_length = random.randint(12, 25)
            elif words_in_sentence >= target_length and word[-1] not in '.,;:':
                word += '.'
                capitalize_next = True
                words_in_sentence = 0
                target_length = random.randint(12, 25)
            rebuilt.append(word)
        return ' '.join(rebuilt)

    def smart_fix(self, text):
        """Apply minimal fixes to maintain human-like quality.

        Runs, in order: basic punctuation fixes, incomplete-sentence fixes,
        and sentence breaking for long period-less text.
        """
        text = self.fix_basic_punctuation_errors(text)
        text = self.fix_incomplete_sentences_only(text)
        text = self.preserve_natural_variations(text)
        return text
| class EnhancedDipperHumanizer: | |
def __init__(self):
    """Load the paraphrasing models and helper objects.

    Sets ``self.device``, ``self.grammar_fixer``, ``self.nlp`` /
    ``self.use_spacy``, ``self.model`` / ``self.tokenizer`` (Dipper, or
    Flan-T5-XL when Dipper cannot be loaded, recorded in ``self.is_dipper``),
    ``self.bart_model`` / ``self.bart_tokenizer`` when BART loads
    (``self.use_bart``), and ``self.human_variations``.

    Raises:
        Exception: if neither Dipper nor the Flan-T5-XL fallback can be
            loaded; the underlying error is attached as ``__cause__``.
    """
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {self.device}")
    # Clear GPU cache
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    # Initialize grammar fixer
    self.grammar_fixer = SelectiveGrammarFixer()
    # Try to load spaCy if available
    self.nlp = None
    self.use_spacy = False
    if SPACY_AVAILABLE:
        try:
            self.nlp = spacy.load("en_core_web_sm")
            self.use_spacy = True
            print("spaCy loaded successfully")
        except Exception:  # not a bare except: Ctrl-C / SystemExit must propagate
            print("spaCy model not found, using NLTK for sentence splitting")
    try:
        # Load Dipper paraphraser WITHOUT 8-bit quantization for better performance
        print("Loading Dipper paraphraser model...")
        self.tokenizer = T5Tokenizer.from_pretrained('google/t5-v1_1-xxl')
        self.model = T5ForConditionalGeneration.from_pretrained(
            "kalpeshk2011/dipper-paraphraser-xxl",
            device_map="auto",  # This will distribute across 4xL40S automatically
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True
        )
        print("Dipper model loaded successfully!")
        self.is_dipper = True
    except Exception as e:
        print(f"Error loading Dipper model: {str(e)}")
        print("Falling back to Flan-T5-XL...")
        self.is_dipper = False
        # Fallback to Flan-T5-XL
        try:
            self.model = T5ForConditionalGeneration.from_pretrained(
                "google/flan-t5-xl",
                torch_dtype=torch.float16,
                low_cpu_mem_usage=True,
                device_map="auto"
            )
            self.tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xl")
            print("Loaded Flan-T5-XL as fallback")
        except Exception as fallback_error:
            # Same exception type and message as before, now chained so the
            # real reason for the failure is visible in the traceback.
            raise Exception(
                "Could not load any model. Please check your system resources."
            ) from fallback_error
    # Load BART as secondary model (optional)
    try:
        print("Loading BART model for additional variation...")
        self.bart_model = AutoModelForSeq2SeqLM.from_pretrained(
            "eugenesiow/bart-paraphrase",
            torch_dtype=torch.float16,
            device_map="auto"  # Distribute across GPUs
        )
        self.bart_tokenizer = AutoTokenizer.from_pretrained("eugenesiow/bart-paraphrase")
        self.use_bart = True
        print("BART model loaded successfully")
    except Exception:
        print("BART model not available")
        self.use_bart = False
    # Initialize human variations handler
    self.human_variations = HumanLikeVariations()
def add_natural_human_patterns(self, text):
    """Add natural human writing patterns that Originality AI associates with human text.

    Per sentence (as returned by ``self.split_into_sentences_advanced``):
    apply contractions; with 15% chance (sentences over 10 words) insert
    "you know,"/"I mean," or prepend an opener; with 8% chance add a small
    error (dropped comma or doubled short word); with 20% chance merge with a
    short following sentence using "but"/"and". Returns the processed
    sentences joined by single spaces.
    """
    def lower_first(fragment):
        # Lower-case the first letter so the fragment can follow an opener or
        # connector, but keep "I", "I'm", ... and all-caps words as they are.
        parts = fragment.split(None, 1)
        if not parts:
            return fragment
        first_word = parts[0].rstrip('.,;:!?')
        if first_word == 'I' or first_word.startswith(("I'", "I\u2019")):
            return fragment
        if len(first_word) > 1 and first_word.isupper():
            return fragment
        return fragment[0].lower() + fragment[1:]

    sentences = self.split_into_sentences_advanced(text)
    result_sentences = []
    for i, sentence in enumerate(sentences):
        if not sentence.strip():
            # Empty, or already merged into the previous sentence
            continue
        # Natural contractions throughout
        sentence = self.apply_contractions(sentence)
        # Add natural speech patterns (15% chance - balanced)
        if random.random() < 0.15 and len(sentence.split()) > 10:
            if random.random() < 0.5:
                # Add "you know" or "I mean" mid-sentence
                words = sentence.split()
                if len(words) > 6:
                    pos = random.randint(3, len(words)-3)
                    if random.random() < 0.5:
                        words.insert(pos, "you know,")
                    else:
                        words.insert(pos, "I mean,")
                    sentence = ' '.join(words)
            else:
                # Start with natural opener
                openers = ["Look,", "See,", "Thing is,", "Honestly,", "Actually,"]
                sentence = random.choice(openers) + " " + lower_first(sentence)
        # Add subtle errors that humans make (8% chance)
        if random.random() < 0.08:
            words = sentence.split()
            if len(words) > 5:
                # Common comma omissions
                if ", and" in sentence and random.random() < 0.3:
                    sentence = sentence.replace(", and", " and", 1)
                # Double words occasionally
                elif random.random() < 0.2:
                    idx = random.randint(1, len(words)-2)
                    if words[idx].lower() in ['the', 'a', 'to', 'in', 'on', 'at']:
                        words.insert(idx+1, words[idx])
                        sentence = ' '.join(words)
        # Natural sentence combinations (20% chance)
        if i < len(sentences) - 1 and random.random() < 0.2:
            next_sent = sentences[i+1].strip()
            if next_sent and len(sentence.split()) + len(next_sent.split()) < 25:
                lowered_next = next_sent.lower()
                # Natural connectors based on content
                if any(w in lowered_next for w in ['but', 'however', 'although']):
                    sentence = sentence.rstrip('.') + ", but " + lower_first(next_sent)
                    sentences[i+1] = ""  # Mark as processed
                elif any(w in lowered_next for w in ['also', 'too', 'as well']):
                    sentence = sentence.rstrip('.') + " and " + lower_first(next_sent)
                    sentences[i+1] = ""  # Mark as processed
        result_sentences.append(sentence)
    return ' '.join([s for s in result_sentences if s])
def vary_sentence_start(self, sentence):
    """Vary sentence beginning to avoid repetitive patterns.

    Sentences that are empty or shorter than five words are returned
    unchanged. Otherwise one of six templates (including "keep the original")
    is chosen uniformly at random. Templates that append their own ending are
    built from the sentence without its closing punctuation, so the result
    never contains "., " or ". is clear.".
    """
    if not sentence:
        return sentence
    words = sentence.split()
    if len(words) < 5:
        return sentence

    def lower_first(fragment):
        # Lower-case the first letter unless the first word is "I" (or an
        # "I'..." contraction) or an all-caps word.
        parts = fragment.split(None, 1)
        if not parts:
            return fragment
        first_word = parts[0].rstrip('.,;:!?')
        if first_word == 'I' or first_word.startswith(("I'", "I\u2019")):
            return fragment
        if len(first_word) > 1 and first_word.isupper():
            return fragment
        return fragment[0].lower() + fragment[1:]

    clause = lower_first(' '.join(words).rstrip('.!?'))
    lowered = lower_first(sentence)
    # Different ways to start sentences naturally
    candidates = [
        "When " + clause + ", it makes sense.",
        "If you think about it, " + lowered,
        sentence + " This is important.",
        "The thing about " + clause + " is clear.",
        "What's interesting is " + lowered,
        sentence,  # Keep original sometimes
    ]
    return random.choice(candidates)
def apply_contractions(self, text):
    """Apply common contractions to make text more natural.

    Every phrase in the table is replaced wherever it occurs as whole words,
    matched case-insensitively. A match that starts with a capital letter
    yields a capitalised contraction, so "It is" becomes "It's" rather than
    "it's". Phrases are applied in table order (e.g. "will not" before
    "we will").
    """
    contractions = {
        "it is": "it's", "that is": "that's", "there is": "there's",
        "he is": "he's", "she is": "she's", "what is": "what's",
        "where is": "where's", "who is": "who's", "how is": "how's",
        "cannot": "can't", "will not": "won't", "do not": "don't",
        "does not": "doesn't", "did not": "didn't", "could not": "couldn't",
        "should not": "shouldn't", "would not": "wouldn't", "is not": "isn't",
        "are not": "aren't", "was not": "wasn't", "were not": "weren't",
        "have not": "haven't", "has not": "hasn't", "had not": "hadn't",
        "I am": "I'm", "you are": "you're", "we are": "we're",
        "they are": "they're", "I have": "I've", "you have": "you've",
        "we have": "we've", "they have": "they've", "I will": "I'll",
        "you will": "you'll", "he will": "he'll", "she will": "she'll",
        "we will": "we'll", "they will": "they'll", "I would": "I'd",
        "you would": "you'd", "he would": "he'd", "she would": "she'd",
        "we would": "we'd", "they would": "they'd", "could have": "could've",
        "should have": "should've", "would have": "would've", "might have": "might've",
        "must have": "must've", "there has": "there's", "here is": "here's",
        "let us": "let's", "that will": "that'll", "who will": "who'll"
    }

    def case_preserving(contraction):
        # Factory so each callback is bound to its own contraction.
        def replace(match):
            if match.group(0)[0].isupper():
                return contraction[0].upper() + contraction[1:]
            return contraction
        return replace

    for full, contr in contractions.items():
        text = re.sub(r'\b' + full + r'\b', case_preserving(contr), text, flags=re.IGNORECASE)
    return text
def should_skip_element(self, element, text):
    """Determine if an element should be skipped from paraphrasing.

    Args:
        element: the text node being considered; only ``element.parent`` is
            read. The parent (and its ancestors) must provide ``name``,
            ``parent``, ``parents``, ``attrs`` and ``get`` -- presumably a
            BeautifulSoup node; confirm against the caller.
        text: the node's text.

    Returns:
        True when the text is too short (under 3 characters after stripping)
        or sits in markup that must be left untouched (scripts, headings,
        bold text, tables, links/buttons, table-of-contents blocks, elements
        with event handlers, testimonial cards, UI widgets, short UI phrases,
        short text in styled boxes); False otherwise.
    """
    if not text or len(text.strip()) < 3:
        return True

    def class_text(node):
        # The class attribute may be a list of names or a single string.
        classes = node.get('class') or []
        if isinstance(classes, str):
            return classes
        return ' '.join(str(cls) for cls in classes)

    parent = element.parent
    if parent:
        # Scripts/styles, headings, bold text, CTAs and buttons
        skipped_tags = {
            'script', 'style', 'noscript',
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title',
            'strong', 'b',
            'button', 'a',
        }
        if parent.name in skipped_tags:
            return True
        # Skip table content: a cell, or anything nested inside a table.
        # (The former "formatting tags inside a table" check came after this
        # one and could never be reached, so it has been removed.)
        if parent.name in ['td', 'th'] or any(p.name == 'table' for p in parent.parents):
            return True
        # Skip table of contents
        # NOTE(review): this searches the parent's whole markup, so any parent
        # whose text or attributes contain "contents" is skipped.
        parent_text = str(parent).lower()
        if any(toc in parent_text for toc in ['table of contents', 'toc-', 'contents']):
            return True
        # Skip if parent has onclick or other event handlers
        if parent.attrs:
            event_handlers = ['onclick', 'onchange', 'onsubmit', 'onload', 'onmouseover', 'onmouseout']
            if any(handler in parent.attrs for handler in event_handlers):
                return True
        # Special check for testimonial cards - up to 3 levels of ancestors
        current = parent
        for _ in range(3):
            if not current:
                break
            if 'testimonial-card' in class_text(current):
                return True
            current = current.parent

    # Skip if IMMEDIATE parent or grandparent has skip-worthy classes/IDs
    skip_indicators = [
        'button', 'btn', 'heading', 'title', 'caption',
        'toc-', 'contents', 'quiz', 'tip', 'note', 'alert',
        'warning', 'info', 'success', 'error', 'code', 'pre',
        'stats-grid', 'testimonial-card',
        'cta-box', 'quiz-container', 'contact-form',
        'faq-question', 'sidebar', 'widget', 'banner',
        'author-intro', 'testimonial', 'review', 'feedback',
        'floating-', 'stat-', 'progress-', 'option', 'results',
        'question-container', 'quiz-',
        'comparision-tables', 'process-flowcharts', 'infographics', 'cost-breakdown'
    ]
    elements_to_check = [parent]
    if parent and parent.parent:
        elements_to_check.append(parent.parent)
    for elem in elements_to_check:
        if not elem:
            continue
        # Check element's class (list or plain string)
        class_str = class_text(elem).lower()
        if any(indicator in class_str for indicator in skip_indicators):
            return True
        # Check element's ID
        elem_id = str(elem.get('id', '')).lower()
        if any(indicator in elem_id for indicator in skip_indicators):
            return True

    # Skip short phrases that might be UI elements
    word_count = len(text.split())
    if word_count <= 5:
        # NOTE(review): these are substring matches, so 'of' also matches
        # words such as "offer"; kept as-is.
        ui_patterns = [
            'click', 'download', 'learn more', 'read more', 'sign up',
            'get started', 'try now', 'buy now', 'next', 'previous',
            'back', 'continue', 'submit', 'cancel', 'get now', 'book your',
            'check out:', 'see also:', 'related:', 'question', 'of'
        ]
        lowered_text = text.lower()
        if any(pattern in lowered_text for pattern in ui_patterns):
            return True

    # Skip very short content in styled containers
    if parent and parent.name in ['div', 'section', 'aside', 'blockquote']:
        style = parent.get('style', '')
        if ('border' in style or 'background' in style) and word_count <= 20:
            # But don't skip if it's inside a paragraph
            if not any(p.name == 'p' for p in parent.parents):
                return True
    return False
def is_likely_acronym_or_proper_noun(self, word):
    """Return True when *word* should keep its existing capitalization.

    Callers use this to decide whether the first word of a sentence may be
    force-capitalized. A word is protected when it is:

    * a known acronym/abbreviation written with its canonical casing,
    * written entirely in upper case (two or more characters),
    * a brand-style word with a lowercase initial and an inner capital
      (e.g. "iPhone", "eBay"), or
    * a capitalized word that commonly continues a proper noun.

    Args:
        word: A single whitespace-delimited token.

    Returns:
        bool: True if the token looks like an acronym or proper noun.
    """
    if not word:
        return False

    # Matched case-sensitively: comparing the upper-cased word would make
    # everyday words such as "it", "act" or "sat" look like acronyms.
    acronyms = {'MBA', 'CEO', 'USA', 'UK', 'GMAT', 'GRE', 'SAT', 'ACT', 'PhD', 'MD', 'IT', 'AI', 'ML'}
    if word in acronyms:
        return True

    # Any other all-caps token of two or more characters is likely an acronym.
    if word.isupper() and len(word) > 1:
        return True

    # Lowercase initial followed by a capital letter: "iPhone", "eBay".
    if word[0].islower() and any(ch.isupper() for ch in word[1:]):
        return True

    # Words such as "Edition" or "University" often follow a proper noun.
    proper_noun_continuations = {
        'Edition', 'Version', 'Series', 'Focus', 'System', 'Method', 'School',
        'University', 'College', 'Institute', 'Academy', 'Center', 'Centre'
    }
    return word in proper_noun_continuations
def clean_model_output_enhanced(self, text):
    """Strip paraphrase-model artifacts from *text* while keeping its content.

    Removes the Dipper control prefix ("lexical = N, order = M"), the
    ``<sent>``/``</sent>`` markers, a leading "paraphrase:" / "rewrite:"
    label, stray backticks and a fixed set of filler phrase markers, then
    collapses whitespace and trims leading punctuation.

    Args:
        text: Raw decoded model output (may be empty or None).

    Returns:
        str: The cleaned text, or "" when *text* is falsy.
    """
    if not text:
        return ""

    # Remove ONLY clear model artifacts.
    text = re.sub(r'^lexical\s*=\s*\d+\s*,\s*order\s*=\s*\d+\s*', '', text, flags=re.IGNORECASE)
    text = re.sub(r'<sent>\s*', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\s*</sent>', '', text, flags=re.IGNORECASE)

    # Drop a single leading task label, if present.
    lowered = text.lower()
    for label in ('paraphrase:', 'rewrite:'):
        if lowered.startswith(label):
            text = text[len(label):].strip()
            break

    # Clean up backticks and LaTeX-style double apostrophes.
    text = re.sub(r'``+', '', text)
    text = re.sub(r"''", '"', text)

    # Remove awkward phrase markers (plain literals, so str.replace suffices).
    phrase_markers = (
        '- actually, scratch that -',
        '- wait, let me back up -',
        '- you know what I mean? -',
        "- okay, here's the thing -",
        '- bear with me here -',
        "- I'm serious -",
        '- or maybe I should say -',
        '- or rather,',
        '- think about it -',
    )
    for marker in phrase_markers:
        text = text.replace(marker, '')

    # Clean up multiple spaces.
    text = re.sub(r'\s+', ' ', text)

    # Trim leading punctuation/whitespace only. \W keeps digits and
    # non-ASCII letters, so "2024 was ..." or "Über ..." stay intact.
    before_trim = text
    text = re.sub(r'^\W+', '', text)

    # Safety net for the trim above: measured against the already-cleaned
    # text, so intentional artifact removal is never rolled back.
    if len(text) < len(before_trim) * 0.5:
        text = before_trim

    return text.strip()
def paraphrase_with_dipper(self, text, lex_diversity=60, order_diversity=20):
    """Paraphrase *text* one sentence at a time with the primary model.

    NOTE: the ``lex_diversity`` / ``order_diversity`` arguments are
    overridden for every sentence based on its word count (70/25 for
    sentences under 10 words, 82/30 otherwise).

    Args:
        text: Text to paraphrase.
        lex_diversity: Lexical diversity (overridden per sentence, see above).
        order_diversity: Order diversity (overridden per sentence, see above).

    Returns:
        The paraphrased text after ``add_natural_human_patterns``; *text*
        unchanged when it is empty or shorter than 3 stripped characters.
        A sentence that raises during processing is kept verbatim.
    """
    if not text or len(text.strip()) < 3:
        return text

    rewritten = []
    # Lower-cased two-word openers already emitted, used to spot repetition.
    seen_openers = []

    for idx, sentence in enumerate(self.split_into_sentences_advanced(text)):
        if len(sentence.strip()) < 3:
            rewritten.append(sentence)
            continue

        try:
            # BALANCED diversity for Originality AI (100% human with better quality)
            if len(sentence.split()) < 10:
                lex_diversity, order_diversity = 70, 25
            else:
                lex_diversity, order_diversity = 82, 30
            lex_code = int(100 - lex_diversity)
            order_code = int(100 - order_diversity)

            # Dipper takes control codes; other models get a plain task prefix.
            if self.is_dipper:
                prompt = f"lexical = {lex_code}, order = {order_code} <sent> {sentence} </sent>"
            else:
                prompt = f"paraphrase: {sentence}"

            encoded = self.tokenizer(
                prompt,
                return_tensors="pt",
                max_length=512,
                truncation=True,
                padding=True
            )

            # Use the first device_map entry when the model has one.
            if hasattr(self.model, 'device_map') and self.model.device_map:
                target = next(iter(self.model.device_map.values()))
            else:
                target = self.device
            encoded = {name: tensor.to(target) for name, tensor in encoded.items()}

            original_length = len(sentence.split())
            max_new_length = int(original_length * 1.4)

            generation_options = dict(
                max_length=max_new_length + 20,
                min_length=max(5, int(original_length * 0.7)),
                do_sample=True,
                top_p=0.92,
                temperature=0.85,
                no_repeat_ngram_size=4,
                num_beams=1,
                early_stopping=True,
            )
            with torch.no_grad():
                generated = self.model.generate(**encoded, **generation_options)

            paraphrased = self.tokenizer.decode(generated[0], skip_special_tokens=True)
            paraphrased = self.clean_model_output_enhanced(paraphrased)
            paraphrased = self.fix_incomplete_sentence_smart(paraphrased, sentence)

            # Vary the opening when the same two-word start was already used twice.
            opener_words = paraphrased.split()[:2]
            if opener_words and idx > 0:
                opener = ' '.join(opener_words).lower()
                if seen_openers.count(opener) >= 2:
                    paraphrased = self.vary_sentence_start(paraphrased)
                seen_openers.append(opener)

            # Cap the output at 1.4x the source word count.
            tokens = paraphrased.split()
            if len(tokens) > max_new_length:
                paraphrased = ' '.join(tokens[:max_new_length])

            rewritten.append(paraphrased)
        except Exception as e:
            print(f"Error paraphrasing sentence: {str(e)}")
            rewritten.append(sentence)

    return self.add_natural_human_patterns(' '.join(rewritten))
def fix_incomplete_sentence_smart(self, generated, original):
    """Repair a possibly truncated generated sentence.

    Drops an obviously cut-off trailing fragment, appends ending punctuation
    chosen from how *original* ends, and capitalizes the first letter unless
    the first word looks like an acronym or proper noun.

    Args:
        generated: Model output for one sentence.
        original: The source sentence; returned when *generated* is empty
            or reduces to nothing.

    Returns:
        str: The repaired sentence.
    """
    if not generated or not generated.strip():
        return original

    generated = generated.strip()
    words = generated.split()

    if len(words) >= 3:
        last_word = words[-1].lower().rstrip('.,!?;:')
        # Adverbs/connectives that legitimately end a sentence: only make
        # sure there is closing punctuation and leave the rest untouched.
        ending_words = {
            'too', 'also', 'well', 'though', 'however',
            'furthermore', 'moreover', 'indeed', 'anyway',
            'regardless', 'nonetheless', 'therefore', 'thus'
        }
        if last_word in ending_words:
            if generated[-1] not in '.!?':
                generated += '.'
            return generated

    # Cut-off detection: a 1-2 letter, all-lowercase, vowel-less tail such
    # as "th" or "b". Numbers ("42"), acronyms ("TV") and punctuated tokens
    # are kept, because they are not truncation debris.
    short_valid_words = {
        'is', 'of', 'to', 'in', 'on', 'at', 'by', 'or', 'if', 'so', 'up',
        'no', 'we', 'he', 'me', 'be', 'do', 'go', 'my'
    }
    tail = words[-1]
    if (len(tail) <= 2 and tail.isalpha() and tail.islower()
            and tail not in short_valid_words
            and not any(ch in 'aeiou' for ch in tail)):
        words = words[:-1]
        generated = ' '.join(words)
        if not generated:
            # Nothing usable is left; fall back to the source sentence.
            return original

    # Add ending punctuation based on how the original sentence ended.
    if generated[-1] not in '.!?:,;':
        orig_stripped = original.strip()
        if orig_stripped.endswith('?'):
            # Keep the question mark only if the output still reads as a question.
            question_words = ['what', 'why', 'how', 'when', 'where', 'who', 'which', 'is', 'are', 'do', 'does', 'can', 'could', 'would', 'should']
            generated += '?' if words[0].lower() in question_words else '.'
        elif orig_stripped.endswith('!'):
            # Keep the exclamation mark only for clearly emphatic wording.
            exclaim_words = ['amazing', 'incredible', 'fantastic', 'terrible', 'awful', 'wonderful', 'excellent']
            lowered = generated.lower()
            generated += '!' if any(word in lowered for word in exclaim_words) else '.'
        elif orig_stripped.endswith(':'):
            generated += ':'
        else:
            generated += '.'

    # Capitalize the sentence start, but leave acronyms/proper nouns alone.
    if generated[0].islower() and not self.is_likely_acronym_or_proper_noun(generated.split()[0]):
        generated = generated[0].upper() + generated[1:]

    return generated
def split_into_sentences_advanced(self, text):
    """Split *text* into sentences.

    Uses the spaCy pipeline in ``self.nlp`` when ``self.use_spacy`` is set,
    otherwise NLTK's ``sent_tokenize``; if that fails (for example because
    the tokenizer data is unavailable) a punctuation-based regex is used.

    Args:
        text: Text to split.

    Returns:
        list[str]: The non-blank sentences in order.
    """
    if self.use_spacy and self.nlp:
        sentences = [sent.text.strip() for sent in self.nlp(text).sents]
    else:
        try:
            sentences = sent_tokenize(text)
        except Exception:
            # Best-effort fallback. Exception (not a bare except) so that
            # KeyboardInterrupt/SystemExit still propagate.
            sentences = re.split(r'(?<=[.!?])\s+', text)

    # Drop empty and whitespace-only entries.
    return [s for s in sentences if s and s.strip()]
def paraphrase_with_bart(self, text):
    """Run a second, sentence-level paraphrasing pass with the BART model.

    Sentences with fewer than 5 words are passed through untouched. The
    joined result is handed to ``self.grammar_fixer.smart_fix``.

    Args:
        text: Text to paraphrase.

    Returns:
        The paraphrased text, or *text* unchanged when BART is disabled, the
        input is shorter than 3 stripped characters, or any error occurs.
    """
    if not self.use_bart or not text or len(text.strip()) < 3:
        return text

    try:
        pieces = []
        for sentence in self.split_into_sentences_advanced(text):
            if len(sentence.split()) < 5:
                pieces.append(sentence)
                continue

            encoded = self.bart_tokenizer(
                sentence,
                return_tensors='pt',
                max_length=128,
                truncation=True
            )

            # Use the first device_map entry when the model has one.
            if hasattr(self.bart_model, 'device_map') and self.bart_model.device_map:
                target = next(iter(self.bart_model.device_map.values()))
            else:
                target = self.device
            encoded = {name: tensor.to(target) for name, tensor in encoded.items()}

            word_total = len(sentence.split())
            with torch.no_grad():
                generated = self.bart_model.generate(
                    **encoded,
                    max_length=int(word_total * 1.4) + 10,
                    min_length=max(5, int(word_total * 0.6)),
                    num_beams=2,
                    temperature=1.1,
                    do_sample=True,
                    top_p=0.9,
                    early_stopping=True
                )

            candidate = self.bart_tokenizer.decode(generated[0], skip_special_tokens=True)
            pieces.append(self.fix_incomplete_sentence_smart(candidate, sentence))

        return self.grammar_fixer.smart_fix(' '.join(pieces))
    except Exception as e:
        print(f"Error in BART paraphrasing: {str(e)}")
        return text
def apply_sentence_variation(self, text):
    """Vary sentence lengths by splitting long runs and merging short pairs.

    * When two consecutive sentences both exceed 20 words, the second is
      split at its first comma if more than 8 words follow the comma.
    * When two consecutive sentences are both under 10 words and the second
      begins with the word "it", "this", "that" or "which", they are merged
      into one sentence.

    Args:
        text: Text to restructure.

    Returns:
        str: The sentences joined with single spaces.
    """
    sentences = self.split_into_sentences_advanced(text)
    varied_sentences = []
    last_sentence_length = 0
    linking_words = {'it', 'this', 'that', 'which'}

    for i, sentence in enumerate(sentences):
        # Blank entries include sentences already merged into their predecessor.
        if not sentence.strip():
            continue

        current_length = len(sentence.split())

        # Break up the second of two long sentences in a row.
        if last_sentence_length > 20 and current_length > 20 and ',' in sentence:
            head, tail = sentence.split(',', 1)
            if len(tail.split()) > 8:
                varied_sentences.append(head.strip() + '.')
                second_part = tail.strip()
                if second_part and second_part[0].islower():
                    second_part = second_part[0].upper() + second_part[1:]
                varied_sentences.append(second_part)
                last_sentence_length = len(tail.split())
                continue

        # Merge two short sentences when the second opens with a linking word.
        if (i < len(sentences) - 1 and
                current_length < 10 and
                len(sentences[i + 1].split()) < 10):
            next_sent = sentences[i + 1].strip()
            # Compare the whole leading word, not a prefix: a prefix test
            # would also match "Italy", "Items" or "Thatcher".
            leading = re.match(r'[A-Za-z]+', next_sent)
            if leading and leading.group(0).lower() in linking_words:
                combined = sentence.rstrip('.') + ' ' + next_sent[0].lower() + next_sent[1:]
                varied_sentences.append(combined)
                sentences[i + 1] = ""  # consumed by the merge
                last_sentence_length = len(combined.split())
                continue

        varied_sentences.append(sentence)
        last_sentence_length = current_length

    return ' '.join(s for s in varied_sentences if s)
def fix_punctuation(self, text):
    """Normalize punctuation, spacing and sentence capitalization in *text*.

    Steps, in order: strip model artifacts, map typographic quotes and
    dashes to ASCII, repair colon/period clashes, collapse whitespace and
    doubled punctuation, tidy quotes and parentheses, capitalize sentence
    starts, fix a few abbreviations, normalize bold-tag punctuation and
    ensure the text ends with terminal punctuation (unless it ends with a
    colon).

    Args:
        text: Text (may contain ``<b>``/``<strong>`` markup).

    Returns:
        str: The normalized text, or "" when *text* is falsy.
    """
    if not text:
        return ""

    # First, clean any remaining model artifacts.
    text = self.clean_model_output_enhanced(text)

    # Remove empty angle brackets.
    text = text.replace('<>', '')

    # Normalize typographic quotes and dashes to ASCII. Written as escapes so
    # the mapping survives editors/scrapers that flatten the characters.
    typographic = str.maketrans({
        '\u00ab': '"',  # left guillemet
        '\u00bb': '"',  # right guillemet
        '\u201e': '"',  # low double quote
        '\u201c': '"',  # left double quote
        '\u201d': '"',  # right double quote
        '\u2018': "'",  # left single quote
        '\u2019': "'",  # right single quote / apostrophe
        '\u2013': '-',  # en dash
        '\u2014': '-',  # em dash
    })
    text = text.translate(typographic)

    # Fix colon issues.
    text = re.sub(r'\.:', ':', text)  # Remove period before colon
    text = re.sub(r':\s*\.', ':', text)  # Remove period after colon

    # Fix basic spacing.
    text = re.sub(r'\s+', ' ', text)  # Multiple spaces to single
    text = re.sub(r'\s+([.,!?;:])', r'\1', text)  # Remove space before punctuation
    text = re.sub(r'([.,!?;:])\s*([.,!?;:])', r'\1', text)  # Remove double punctuation
    text = re.sub(r'([.!?])\s*\1+', r'\1', text)  # Remove repeated punctuation

    # Fix colons.
    text = re.sub(r':\s*([.,!?])', ':', text)  # Remove punctuation after colon
    text = re.sub(r'([.,!?])\s*:', ':', text)  # Remove punctuation before colon
    text = re.sub(r':+', ':', text)  # Multiple colons to one

    # Trim padding just inside quotes and parentheses.
    text = re.sub(r'"\s*([^"]*?)\s*"', r'"\1"', text)
    text = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", text)
    text = re.sub(r'\(\s*([^)]*?)\s*\)', r'(\1)', text)

    # Capitalize the first word of each sentence unless it is protected.
    fixed_sentences = []
    for sentence in re.split(r'(?<=[.!?])\s+', text):
        if not sentence:
            continue
        words = sentence.split()
        if words:
            first_word = words[0]
            if (first_word[0].islower() and
                    not self.is_likely_acronym_or_proper_noun(first_word)):
                capitalized = first_word[0].upper() + first_word[1:]
                sentence = ' '.join([capitalized] + words[1:])
        fixed_sentences.append(sentence)
    text = ' '.join(fixed_sentences)

    # Fix common issues.
    text = re.sub(r'\bi\b', 'I', text)  # Capitalize 'I'
    text = re.sub(r'\.{2,}', '.', text)  # Multiple periods to one
    text = re.sub(r',{2,}', ',', text)  # Multiple commas to one
    text = re.sub(r'\s*,\s*,\s*', ', ', text)  # Double commas with spaces

    # Remove stray chapter/section headings.
    text = re.sub(r'\b(CHAPTER\s+[IVX]+|SECTION\s+\d+)\b[^\w]*', '', text, flags=re.IGNORECASE)

    # Fix abbreviations.
    text = re.sub(r'\betc\s*\.\s*\.', 'etc.', text)
    text = re.sub(r'\be\.g\s*\.\s*[,\s]', 'e.g., ', text)
    text = re.sub(r'\bi\.e\s*\.\s*[,\s]', 'i.e., ', text)

    # Single space after numbered-list markers such as "1. ".
    text = re.sub(r'(\d+)\.\s+', r'\1. ', text)

    # Fix bold/strong tags punctuation.
    text = self.fix_bold_punctuation(text)

    # Clean up any remaining issues.
    text = re.sub(r'\s+([.,!?;:])', r'\1', text)  # Final space cleanup
    text = re.sub(r'([.,!?;:])\s{2,}', r'\1 ', text)  # Fix multiple spaces after punctuation

    # Ensure ending punctuation; a trailing colon is likely a list header.
    text = text.strip()
    if text and text[-1] not in '.!?' and not text.endswith(':'):
        text += '.'

    return text
def fix_bold_punctuation(self, text):
    """Tidy punctuation and capitalization around ``<b>``/``<strong>`` tags.

    Text that opens with a bold "Label:" (optionally after a bullet) is
    treated as a list item: only the spacing inside that label is
    normalized and the text is returned. Otherwise periods are stripped
    from the edges of bold content, bold content at a sentence start is
    capitalized, and spacing/periods next to the tags are normalized.

    Args:
        text: Text containing bold/strong markup.

    Returns:
        str: The adjusted text.
    """
    list_item_re = r'^\s*(?:[-•*▪▫◦‣⁃]\s*)?<(?:strong|b)>[^<]+:</(?:strong|b)>'

    def looks_like_list_item(candidate):
        # True when the text starts with <strong>Label:</strong> or <b>Label:</b>.
        return re.search(list_item_re, candidate) is not None

    # List items keep the colon inside the bold tag; only tighten spacing.
    if looks_like_list_item(text):
        return re.sub(r'<(strong|b)>\s*([^:]+)\s*:\s*</\1>', r'<\1>\2:</\1>', text)

    source = text  # the string the match offsets below refer to
    sentence_breaks = ('. ', '! ', '? ', '\n\n')

    def rebuild_bold(match):
        tag = match.group(1)
        inner = match.group(2).strip()
        if not inner:
            return f'<{tag}></{tag}>'
        # A trailing colon marks a list header; keep it as is.
        if inner.endswith(':'):
            return f'<{tag}>{inner}</{tag}>'
        # Periods at either edge of the bold content are dropped.
        inner = inner.strip('.')
        offset = match.start()
        starts_sentence = offset == 0 or (
            offset > 2 and source[offset - 2:offset] in sentence_breaks
        )
        if starts_sentence and inner and inner[0].isalpha():
            inner = inner[0].upper() + inner[1:]
        return f'<{tag}>{inner}</{tag}>'

    text = re.sub(r'<(strong|b)>(.*?)</\1>', rebuild_bold, text)

    # Spacing around the tags; re-checked because the rewrite above can
    # turn the text into a list item.
    if not looks_like_list_item(text):
        text = re.sub(r'\.\s*<(strong|b)>', r'. <\1>', text)  # Period before bold
        text = re.sub(r'</(strong|b)>\s*\.', r'</\1>.', text)  # Period after bold
        text = re.sub(r'([.!?])\s*<(strong|b)>', r'\1 <\2>', text)  # Space after sentence end
        text = re.sub(r'</(strong|b)>\s+([a-z])', r'</\1> \2', text)  # Single space before a lowercase continuation

    # Remove duplicate periods around bold tags.
    text = re.sub(r'\.\s*</(strong|b)>\s*\.', r'</\1>.', text)
    text = re.sub(r'\.\s*<(strong|b)>\s*\.', r'. <\1>', text)

    # Bold followed by a capitalized word is treated as a sentence end.
    text = re.sub(r'</(strong|b)>\s+([A-Z])', r'</\1>. \2', text)

    # Collapse bold tags that only wrap a colon or a period (not for list items).
    if not looks_like_list_item(text):
        text = re.sub(r'<(strong|b)>\s*:\s*</\1>', ':', text)
        text = re.sub(r'<(strong|b)>\s*\.\s*</\1>', '.', text)

    return text
def extract_text_from_html(self, html_content):
    """Parse *html_content* and collect the text nodes eligible for rewriting.

    Args:
        html_content: HTML markup as a string.

    Returns:
        tuple: ``(soup, text_elements)`` where ``soup`` is the parsed
        BeautifulSoup tree and ``text_elements`` is a list of dicts with
        keys ``'text'`` (the stripped string) and ``'element'`` (the text
        node inside ``soup``).
    """
    # Local import: the base class of Comment, Doctype, CData, Declaration
    # and ProcessingInstruction nodes.
    from bs4.element import PreformattedString

    soup = BeautifulSoup(html_content, 'html.parser')
    text_elements = []

    for element in soup.find_all(string=True):
        # find_all(string=True) also yields comments, doctypes and other
        # special strings; those are markup, not prose, and must never be
        # rewritten or replaced with plain text.
        if isinstance(element, PreformattedString):
            continue

        # Skip script, style, and noscript content completely.
        parent = element.parent
        if parent is None or parent.name in ('script', 'style', 'noscript'):
            continue

        text = element.strip()
        if text and not self.should_skip_element(element, text):
            text_elements.append({
                'text': text,
                'element': element
            })

    return soup, text_elements
def validate_and_fix_html(self, html_text):
    """Repair common HTML syntax slips introduced by text processing.

    Normalizes the DOCTYPE keyword, removes stray whitespace just inside
    tag brackets and collapses whitespace between adjacent tags to a single
    space. ``<script>``, ``<style>``, ``<pre>`` and ``<textarea>`` blocks
    are copied through verbatim for these spacing fixes.

    Args:
        html_text: Serialized HTML.

    Returns:
        str: The repaired HTML.
    """
    # Fix DOCTYPE.
    html_text = re.sub(r'<!\s*DOCTYPE', '<!DOCTYPE', html_text, flags=re.IGNORECASE)

    def tidy_markup(fragment):
        fragment = re.sub(r'\s+>', '>', fragment)  # Remove spaces before closing >
        fragment = re.sub(r'<\s+', '<', fragment)  # Remove spaces after opening <
        # Keep one space between tags: removing it entirely would glue
        # adjacent inline elements ("<b>a</b> <i>b</i>") together.
        return re.sub(r'>\s+<', '> <', fragment)

    # Whitespace and </> characters are significant inside these elements
    # (JS comparisons, CSS combinators, preformatted text), so skip them.
    verbatim_block = re.compile(
        r'<(script|style|pre|textarea)\b.*?</\1\s*>',
        re.IGNORECASE | re.DOTALL,
    )
    pieces = []
    cursor = 0
    for block in verbatim_block.finditer(html_text):
        pieces.append(tidy_markup(html_text[cursor:block.start()]))
        pieces.append(block.group(0))
        cursor = block.end()
    pieces.append(tidy_markup(html_text[cursor:]))
    html_text = ''.join(pieces)

    # Fix common word errors that might occur during processing.
    html_text = html_text.replace('down loaded', 'downloaded')
    html_text = html_text.replace('But your document', 'Your document')

    return html_text
def add_natural_flow_variations(self, text):
    """Randomly sprinkle conversational asides into the sentences of *text*.

    Each sentence independently may receive:

    * a stream-of-consciousness aside (8% chance, sentences over 10 words),
    * a self-correction marker (7% chance, sentences over 8 words),
    * a thinking-out-loud lead-in (10% chance, never on the first sentence).

    Args:
        text: Text to decorate.

    Returns:
        str: The sentences joined with single spaces.
    """
    stream_elements = [
        " - wait, let me back up - ",
        " - actually, scratch that - ",
        " - or maybe I should say - ",
        " - hmm, how do I put this - ",
        " - okay, here's the thing - ",
        " - you know what I mean? - "
    ]
    corrections = [
        " - or rather, ",
        " - well, actually, ",
        " - I mean, ",
        " - or should I say, ",
        " - correction: "
    ]
    thinking_patterns = [
        "Come to think of it, ",
        "Actually, you know what? ",
        "Wait, here's a thought: ",
        "Oh, and another thing - ",
        "Speaking of which, ",
        "This reminds me, ",
        "Now that I mention it, ",
        "Funny you should ask, because "
    ]

    decorated = []
    for position, sentence in enumerate(self.split_into_sentences_advanced(text)):
        if not sentence.strip():
            continue

        # Stream-of-consciousness aside, inserted in the middle half.
        if random.random() < 0.08 and len(sentence.split()) > 10:
            tokens = sentence.split()
            slot = random.randint(len(tokens) // 4, 3 * len(tokens) // 4)
            tokens.insert(slot, random.choice(stream_elements))
            sentence = ' '.join(tokens)

        # Self-correction marker, inserted in the second half.
        if random.random() < 0.07:
            tokens = sentence.split()
            if len(tokens) > 8:
                slot = random.randint(len(tokens) // 2, len(tokens) - 3)
                marker = random.choice(corrections)
                # Index of a word shortly before the slot; only used as a guard.
                earlier_idx = random.randint(max(0, slot - 5), slot - 1)
                if earlier_idx < len(tokens):
                    tokens.insert(slot, marker)
                    sentence = ' '.join(tokens)

        # Thinking-out-loud lead-in; the sentence's first letter is lower-cased.
        if random.random() < 0.10 and position > 0:
            lead_in = random.choice(thinking_patterns)
            if len(sentence) > 1:
                sentence = lead_in + sentence[0].lower() + sentence[1:]

        decorated.append(sentence)

    return ' '.join(decorated)
def process_html(self, html_content, progress_callback=None):
    """Paraphrase the eligible text nodes of an HTML document.

    ``<script>`` and ``<style>`` elements are swapped for placeholders
    before processing and put back afterwards, so their content is never
    rewritten.

    Args:
        html_content: HTML markup as a string.
        progress_callback: Optional ``callable(current, total)`` invoked
            after each processed text element.

    Returns:
        str: The processed HTML. A prompt message is returned for blank
        input; on failure the untouched input HTML is returned, preceded by
        an HTML comment containing the error and traceback.
    """
    if not html_content or not html_content.strip():
        return "Please provide HTML content."

    # Kept so the error path can hand back exactly what the caller supplied.
    original_html = html_content

    script_placeholder = "###SCRIPT_PLACEHOLDER_{}###"
    style_placeholder = "###STYLE_PLACEHOLDER_{}###"
    preserved_scripts = []
    preserved_styles = []

    # Temporarily replace script and style tags with placeholders.
    soup_temp = BeautifulSoup(html_content, 'html.parser')
    for idx, script in enumerate(soup_temp.find_all('script')):
        preserved_scripts.append(str(script))
        script.replace_with(script_placeholder.format(idx))
    for idx, style in enumerate(soup_temp.find_all('style')):
        preserved_styles.append(str(style))
        style.replace_with(style_placeholder.format(idx))
    html_content = str(soup_temp)

    def restore_preserved(markup):
        # Put the original script/style markup back in place of placeholders.
        for idx, script_content in enumerate(preserved_scripts):
            markup = markup.replace(script_placeholder.format(idx), script_content)
        for idx, style_content in enumerate(preserved_styles):
            markup = markup.replace(style_placeholder.format(idx), style_content)
        return markup

    def finalize(markup):
        # Bold/punctuation post-processing followed by HTML syntax repair.
        return self.validate_and_fix_html(self.post_process_html(markup))

    try:
        soup, text_elements = self.extract_text_from_html(html_content)
        total_elements = len(text_elements)
        print(f"Found {total_elements} text elements to process (after filtering)")

        processed_count = 0
        for i, element_info in enumerate(text_elements):
            original_text = element_info['text']

            # Skip placeholders.
            if "###SCRIPT_PLACEHOLDER_" in original_text or "###STYLE_PLACEHOLDER_" in original_text:
                continue

            # Skip very short texts.
            if len(original_text.split()) < 3:
                continue

            # First pass with Dipper.
            paraphrased_text = self.paraphrase_with_dipper(
                original_text,
                lex_diversity=60,
                order_diversity=20
            )

            # Second pass with BART for longer texts, 30% of the time.
            if self.use_bart and len(paraphrased_text.split()) > 8:
                if random.random() < 0.3:
                    paraphrased_text = self.paraphrase_with_bart(paraphrased_text)

            paraphrased_text = self.apply_sentence_variation(paraphrased_text)
            paraphrased_text = self.add_natural_flow_variations(paraphrased_text)
            paraphrased_text = self.fix_punctuation(paraphrased_text)

            # Final quality check.
            if paraphrased_text and len(paraphrased_text.split()) >= 3:
                element_info['element'].replace_with(NavigableString(paraphrased_text))
                processed_count += 1

            # Progress update.
            if progress_callback:
                progress_callback(i + 1, total_elements)
            if i % 10 == 0 or i == total_elements - 1:
                progress = (i + 1) / total_elements * 100
                print(f"Progress: {progress:.1f}%")

        processed = str(soup)

        # Post-process while scripts/styles are still placeholders, so the
        # punctuation regexes cannot rewrite "...", "::", ">>" or "<<"
        # inside them.
        expected_placeholders = (
            [script_placeholder.format(i) for i in range(len(preserved_scripts))]
            + [style_placeholder.format(i) for i in range(len(preserved_styles))]
        )
        cleaned = finalize(processed)
        if all(token in cleaned for token in expected_placeholders):
            result = restore_preserved(cleaned)
        else:
            # A placeholder was altered by post-processing; restore first so
            # no script or style is lost.
            result = finalize(restore_preserved(processed))

        # Count skipped elements properly.
        all_text_elements = soup.find_all(string=True)
        skipped = len([e for e in all_text_elements if e.strip() and e.parent.name not in ['script', 'style', 'noscript']]) - total_elements

        print(f"Successfully processed {processed_count} text elements")
        print(f"Skipped {skipped} elements (headings, CTAs, tables, testimonials, strong/bold tags, etc.)")
        print(f"Preserved {len(preserved_scripts)} script tags and {len(preserved_styles)} style tags")

        return result
    except Exception as e:
        import traceback
        error_msg = f"Error processing HTML: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)
        # Return the caller's HTML (not the placeholder-substituted copy)
        # with the error prepended as an HTML comment.
        return f"<!-- {error_msg} -->\n{original_html}"
def post_process_html(self, html_text):
    """Clean up punctuation and bold-tag formatting across a whole document.

    Removes stray/doubled angle brackets, normalizes periods and spacing
    around ``<b>``/``<strong>`` tags, capitalizes bold text that opens a
    sentence, collapses repeated punctuation and finally applies
    ``self.grammar_fixer.smart_fix``. Bold "Label:" list headers are left
    untouched.

    Args:
        html_text: Serialized HTML.

    Returns:
        str: The cleaned HTML.
    """
    # Fix empty angle brackets that might appear.
    html_text = re.sub(r'<>\s*([^<>]+?)\s*(?=\.|\s|<)', r'\1', html_text)  # Remove <> around text
    html_text = re.sub(r'<>', '', html_text)  # Remove any remaining empty <>

    # Fix double angle brackets around bold tags.
    html_text = re.sub(r'<<b>>', '<b>', html_text)
    html_text = re.sub(r'<</b>>', '</b>', html_text)
    html_text = re.sub(r'<<strong>>', '<strong>', html_text)
    html_text = re.sub(r'<</strong>>', '</strong>', html_text)

    # Fix periods around bold/strong tags. The replacements must be raw
    # strings: in a normal literal '\1' is the control character U+0001,
    # not a backreference.
    html_text = re.sub(r'\.\s*<(b|strong)>', r'. <\1>', html_text)  # Period before bold
    html_text = re.sub(r'</(b|strong)>\s*\.', r'</\1>.', html_text)  # Period after bold
    html_text = re.sub(r'\.<<(b|strong)>>', r'. <\1>', html_text)  # Fix double bracket cases
    html_text = re.sub(r'</(b|strong)>>\.', r'</\1>.', html_text)

    # Fix periods after colons.
    html_text = re.sub(r':\s*\.', ':', html_text)
    html_text = re.sub(r'\.:', ':', html_text)

    list_pattern = r'(?:^|\s)(?:[-•*▪▫◦‣⁃]\s*)?<(?:strong|b)>[^<]+:</(?:strong|b)>'

    def process_line(line):
        # Lines carrying a bold "Label:" header keep their colon format.
        if re.search(list_pattern, line):
            return line
        # Remove periods immediately inside bold tags.
        line = re.sub(r'<(strong|b)>\s*\.\s*([^<]+)\s*\.\s*</\1>', r'<\1>\2</\1>', line)
        # Pull sentence-ending punctuation up against the closing tag.
        line = re.sub(r'</(strong|b)>\s*([.!?])', r'</\1>\2', line)
        return line

    # Process line by line to preserve list formatting.
    html_text = '\n'.join(process_line(line) for line in html_text.split('\n'))

    def capitalize_leading_bold(match):
        # Group 2 is the bare tag name, so the tag is re-emitted as <b>,
        # never as a doubled "<<b>>".
        pre_context, tag, first_letter = match.group(1), match.group(2), match.group(3)
        # Bold text opens a sentence when nothing, sentence punctuation or a
        # tag immediately precedes it.
        if pre_context == '' or pre_context.endswith(('.', '!', '?', '>')):
            first_letter = first_letter.upper()
        return f'{pre_context}<{tag}>{first_letter}'

    html_text = re.sub(r'(^|.*?)<(strong|b)>([a-zA-Z])', capitalize_leading_bold, html_text)

    # Clean up spacing around bold tags; odd-indexed segments are the bold
    # "Label:" headers captured by the split and are passed through as is.
    segments = re.split(r'(<(?:strong|b)>[^<]*:</(?:strong|b)>)', html_text)
    cleaned_segments = []
    for i, segment in enumerate(segments):
        if i % 2 == 0:
            segment = re.sub(r'\s+<(strong|b)>', r' <\1>', segment)
            segment = re.sub(r'</(strong|b)>\s+', r'</\1> ', segment)
            # Fix punctuation issues.
            segment = re.sub(r'([.,!?;:])\s*([.,!?;:])', r'\1', segment)
            # Fix periods inside/around bold.
            segment = re.sub(r'\.<(strong|b)>\.', r'. <\1>', segment)
            segment = re.sub(r'\.</(strong|b)>\.', r'</\1>.', segment)
        cleaned_segments.append(segment)
    html_text = ''.join(cleaned_segments)

    # Final cleanup.
    html_text = re.sub(r'\.{2,}', '.', html_text)  # Multiple periods
    html_text = re.sub(r',{2,}', ',', html_text)  # Multiple commas
    html_text = re.sub(r':{2,}', ':', html_text)  # Multiple colons
    html_text = re.sub(r'\s+([.,!?;:])', r'\1', html_text)  # Space before punctuation

    # Remove empty bold tags (those holding just a colon are not empty).
    html_text = re.sub(r'<(strong|b)>\s*</\1>', '', html_text)

    # Stats such as "5,000+" at the end of a line should not carry a period.
    html_text = re.sub(r'(\d+[,\d]*\+?)\s*\.\s*\n', r'\1\n', html_text)

    # Clean up any remaining double brackets.
    html_text = re.sub(r'<<', '<', html_text)
    html_text = re.sub(r'>>', '>', html_text)

    # Apply final minimal grammar fixes.
    html_text = self.grammar_fixer.smart_fix(html_text)

    return html_text
# Shared humanizer instance; humanize_html() sends every request through it.
humanizer = EnhancedDipperHumanizer()
def humanize_html(html_input, progress=gr.Progress()):
    """Humanize an HTML document for the Gradio UI, reporting progress.

    Args:
        html_input: Raw HTML text taken from the input textbox.
        progress: Gradio progress tracker. It is called with a fraction in
            [0, 1] and a ``desc`` label. The ``gr.Progress()`` default must
            stay in the signature (see the Gradio ``Progress`` docs for how
            the tracker is supplied at request time).

    Returns:
        Whatever ``humanizer.process_html`` returns for the input, or a short
        prompt message when no usable input was supplied.
    """
    # Whitespace-only text is as unusable as an empty box: reject it up front
    # instead of starting a full processing run on blank input.
    is_blank_text = isinstance(html_input, str) and not html_input.strip()
    if not html_input or is_blank_text:
        return "Please provide HTML content to humanize."

    progress(0, desc="Starting processing...")
    # perf_counter() is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments during a long run.
    started = time.perf_counter()

    def progress_callback(current, total):
        """Forward element counts from process_html to the progress tracker."""
        # A zero total would divide by zero; skip the update in that case.
        if total > 0:
            progress(current / total, desc=f"Processing: {current}/{total} elements")

    # Pass progress callback to process_html
    result = humanizer.process_html(
        html_input,
        progress_callback=progress_callback,
    )

    elapsed = time.perf_counter() - started
    print(f"Processing completed in {elapsed:.2f} seconds")
    progress(1.0, desc="Complete!")
    return result
# Text shown beneath the interface title. Gradio renders the description as
# Markdown/HTML, so literal tag names are wrapped in backticks; written bare,
# "<strong>" and "<b>" are parsed as markup and disappear from the page.
_DESCRIPTION = """
Ultra-aggressive humanizer optimized to achieve 100% human scores on both Undetectable AI and Originality AI.
Key Features:
- Maximum diversity settings (90% lexical, 40% order) for natural variation
- Enhanced human patterns: personal opinions, self-corrections, thinking-out-loud
- Natural typos, contractions, and conversational flow
- Stream-of-consciousness elements and rhetorical questions
- Originality AI-specific optimizations: varied sentence starters, emphatic repetitions
- Skips content in `<strong>`, `<b>`, and heading tags (including inside tables)
- Designed to pass the strictest AI detection systems
The tool creates genuinely human-like writing patterns that fool even the most sophisticated detectors!
⚠️ Note: Processing may take 5-10 minutes for large HTML documents.
"""

# Sample document offered as a one-click example in the UI.
_EXAMPLE_HTML = """<article>
<h1>The Benefits of Regular Exercise</h1>
<div class="author-intro">By John Doe, Fitness Expert | 10 years experience</div>
<p>Regular exercise is essential for maintaining good health. It helps improve cardiovascular fitness, strengthens muscles, and enhances mental well-being. Studies have shown that people who exercise regularly have lower risks of chronic diseases.</p>
<p>Additionally, exercise can boost mood and energy levels. It releases endorphins, which are natural mood elevators. Even moderate activities like walking can make a significant difference in overall health.</p>
</article>"""

# Build the Gradio interface: one HTML textbox in, one textbox out, with
# humanize_html as the handler.
iface = gr.Interface(
    fn=humanize_html,
    inputs=[
        gr.Textbox(
            lines=10,
            placeholder="Paste your HTML content here...",
            label="HTML Input",
        )
    ],
    outputs=gr.Textbox(
        lines=10,
        label="Humanized HTML Output",
    ),
    title="Enhanced Dipper AI Humanizer - Optimized for Originality AI",
    description=_DESCRIPTION,
    # One example row containing a single input value.
    examples=[[_EXAMPLE_HTML]],
    theme="default",
)
if __name__ == "__main__":
    # Turn on request queuing (max_size=10) so long-running jobs are handled
    # through the queue, then start the app with a share link requested.
    iface.queue(max_size=10).launch(share=True)