alexdev404 committed on
Commit 3b10d9f · verified · 1 Parent(s): bca8c75

Update app.py

Files changed (1): app.py +1718 -57
app.py CHANGED
@@ -1,70 +1,1731 @@
  import gradio as gr
- from huggingface_hub import InferenceClient
-
-
- def respond(
-     message,
-     history: list[dict[str, str]],
-     system_message,
-     max_tokens,
-     temperature,
-     top_p,
-     hf_token: gr.OAuthToken,
- ):
-     """
-     For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-     """
-     client = InferenceClient(token=hf_token.token, model="openai/gpt-oss-20b")
-
-     messages = [{"role": "system", "content": system_message}]
-
-     messages.extend(history)
-
-     messages.append({"role": "user", "content": message})
-
-     response = ""
-
-     for message in client.chat_completion(
-         messages,
-         max_tokens=max_tokens,
-         stream=True,
-         temperature=temperature,
-         top_p=top_p,
-     ):
-         choices = message.choices
-         token = ""
-         if len(choices) and choices[0].delta.content:
-             token = choices[0].delta.content
-
-         response += token
-         yield response
-
-
- """
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
- """
- chatbot = gr.ChatInterface(
-     respond,
-     type="messages",
-     additional_inputs=[
-         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.95,
-             step=0.05,
-             label="Top-p (nucleus sampling)",
-         ),
      ],
  )

- with gr.Blocks() as demo:
-     with gr.Sidebar():
-         gr.LoginButton()
-     chatbot.render()
-
-
  if __name__ == "__main__":
-     demo.launch()
  import gradio as gr
+ import torch
+ from transformers import T5ForConditionalGeneration, T5Tokenizer, AutoTokenizer, AutoModelForSeq2SeqLM
+ from bs4 import BeautifulSoup, NavigableString, Tag
+ import re
+ import time
+ import random
+ import nltk
+ from nltk.tokenize import sent_tokenize
+
+ # Download required NLTK data
+ try:
+     nltk.download('punkt', quiet=True)
+ except:
+     pass
+
+ # Try to import spaCy but make it optional
+ try:
+     import spacy
+     SPACY_AVAILABLE = True
+ except:
+     print("spaCy not available, using NLTK for sentence processing")
+     SPACY_AVAILABLE = False
+
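# Editor's sketch (hypothetical, not part of the commit): the fallback chain
# above means sentence splitting keeps working even without spaCy installed.
# With only the 'punkt' data downloaded:
#     from nltk.tokenize import sent_tokenize
#     sent_tokenize("Dr. Smith arrived. He was late.")
#     # -> ['Dr. Smith arrived.', 'He was late.']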
+ class HumanLikeVariations:
+     """Add human-like variations and intentional imperfections"""
+
+     def __init__(self):
+         # Common human writing patterns - EXPANDED for Originality AI
+         self.casual_transitions = [
+             "So, ", "Well, ", "Now, ", "Actually, ", "Basically, ",
+             "You know, ", "I mean, ", "Thing is, ", "Honestly, ",
+             "Look, ", "Listen, ", "See, ", "Okay, ", "Right, ",
+             "Anyway, ", "Besides, ", "Plus, ", "Also, ", "Oh, ",
+             "Hey, ", "Alright, ", "Sure, ", "Fine, ", "Obviously, ",
+             "Clearly, ", "Seriously, ", "Literally, ", "Frankly, ",
+             "To be honest, ", "Truth is, ", "In fact, ", "Believe it or not, ",
+             "Here's the thing, ", "Let me tell you, ", "Get this, ",
+             "Funny thing is, ", "Interestingly, ", "Surprisingly, ",
+             "Let's be real here, ", "Can we talk about ", "Quick question: ",
+             "Real talk: ", "Hot take: ", "Unpopular opinion: ", "Fun fact: ",
+             "Pro tip: ", "Side note: ", "Random thought: ", "Food for thought: ",
+             "Just saying, ", "Not gonna lie, ", "For what it's worth, ",
+             "If you ask me, ", "Between you and me, ", "Here's my take: ",
+             "Let's face it, ", "No kidding, ", "Seriously though, ",
+             "But wait, ", "Hold on, ", "Check this out: ", "Guess what? "
+         ]
+
+         self.filler_phrases = [
+             "kind of", "sort of", "pretty much", "basically", "actually",
+             "really", "just", "quite", "rather", "fairly", "totally",
+             "definitely", "probably", "maybe", "perhaps", "somehow",
+             "somewhat", "literally", "seriously", "honestly", "frankly",
+             "simply", "merely", "purely", "truly", "genuinely",
+             "absolutely", "completely", "entirely", "utterly", "practically",
+             "virtually", "essentially", "fundamentally", "generally", "typically",
+             "usually", "normally", "often", "sometimes", "occasionally",
+             "apparently", "evidently", "obviously", "clearly", "seemingly",
+             "arguably", "potentially", "possibly", "likely", "unlikely",
+             "more or less", "give or take", "so to speak", "if you will",
+             "per se", "as such", "in a way", "to some extent", "to a degree",
+             "I kid you not", "no joke", "for real", "not gonna lie",
+             "I'm telling you", "trust me", "believe me", "I swear",
+             "hands down", "without a doubt", "100%", "straight up",
+             "I think", "I feel like", "I guess", "I suppose", "seems like",
+             "appears to be", "might be", "could be", "tends to", "tends to be",
+             "in my experience", "from what I've seen", "as far as I know",
+             "to the best of my knowledge", "if I'm not mistaken", "correct me if I'm wrong",
+             "you know what", "here's the deal", "bottom line", "at any rate",
+             "all in all", "when you think about it", "come to think of it",
+             "now that I think about it", "if we're being honest", "to be fair"
+         ]
+
+         self.human_connectors = [
+             ", which means", ", so", ", because", ", since", ", although",
+             ". That's why", ". This means", ". So basically,", ". The thing is,",
+             ", and honestly", ", but here's the thing", ", though", ", however",
+             ". Plus,", ". Also,", ". Besides,", ". Moreover,", ". Furthermore,",
+             ", which is why", ", and that's because", ", given that", ", considering",
+             ". In other words,", ". Put simply,", ". To clarify,", ". That said,",
+             ", you see", ", you know", ", right?", ", okay?", ", yeah?",
+             ". Here's why:", ". Let me explain:", ". Think about it:",
+             ", if you ask me", ", in my opinion", ", from my perspective",
+             ". On the flip side,", ". On the other hand,", ". Conversely,",
+             ", not to mention", ", let alone", ", much less", ", aside from",
+             ". What's more,", ". Even better,", ". Even worse,", ". The catch is,",
+             ", believe it or not", ", surprisingly enough", ", interestingly enough",
+             ". Long story short,", ". Bottom line is,", ". Point being,",
+             ", as you might expect", ", as it turns out", ", as luck would have it",
+             ". And get this:", ". But wait, there's more:", ". Here's the kicker:",
+             ", and here's why", ", and here's the thing", ", but here's what happened",
+             ". Spoiler alert:", ". Plot twist:", ". Reality check:",
+             ", at the end of the day", ", when all is said and done", ", all things considered",
+             ". Make no mistake,", ". Don't get me wrong,", ". Let's not forget,",
+             ", between you and me", ", off the record", ", just between us",
+             ". And honestly?", ". But seriously,", ". And you know what?",
+             ", which brings me to", ". This reminds me of", ", speaking of which",
+             ". Funny enough,", ". Weird thing is,", ". Strange but true:",
+             ", and I mean", ". I'm not kidding when I say", ", and trust me on this"
+         ]
+
+         # NEW: Common human typos and variations
+         self.common_typos = {
+             "the": ["teh", "th", "hte"],
+             "and": ["adn", "nad", "an"],
+             "that": ["taht", "htat", "tha"],
+             "with": ["wiht", "wtih", "iwth"],
+             "have": ["ahve", "hvae", "hav"],
+             "from": ["form", "fro", "frmo"],
+             "they": ["tehy", "thye", "htey"],
+             "which": ["whihc", "wich", "whcih"],
+             "their": ["thier", "theri", "tehir"],
+             "would": ["woudl", "wuold", "woul"],
+             "there": ["tehre", "theer", "ther"],
+             "could": ["coudl", "cuold", "coud"],
+             "people": ["poeple", "peopel", "pepole"],
+             "through": ["thorugh", "throught", "trhough"],
+             "because": ["becuase", "becasue", "beacuse"],
+             "before": ["beofre", "befroe", "befor"],
+             "different": ["differnt", "differnet", "diferent"],
+             "between": ["bewteen", "betwen", "betewen"],
+             "important": ["improtant", "importnat", "importan"],
+             "information": ["infromation", "informaiton", "informaton"]
+         }
+
+         # NEW: Human-like sentence starters for variety
+         self.varied_starters = [
+             "When it comes to", "As for", "Regarding", "In terms of",
+             "With respect to", "Concerning", "Speaking of", "About",
+             "If we look at", "Looking at", "Considering", "Given",
+             "Taking into account", "Bear in mind that", "Keep in mind",
+             "It's worth noting that", "It should be noted that",
+             "One thing to consider is", "An important point is",
+             "What's interesting is", "What stands out is",
+             "The key here is", "The main thing is", "The point is",
+             "Here's what matters:", "Here's the deal:", "Here's something:",
+             "Let's not forget", "We should remember", "Don't forget",
+             "Think about it this way:", "Look at it like this:",
+             "Consider this:", "Picture this:", "Imagine this:",
+             "You might wonder", "You might ask", "You may think",
+             "Some people say", "Many believe", "It's often said",
+             "Research shows", "Studies indicate", "Evidence suggests",
+             "Experience tells us", "History shows", "Time has shown"
+         ]
+
+     def add_human_touch(self, text):
+         """Add subtle human-like imperfections - NATURAL PATTERNS ONLY"""
+         sentences = text.split('. ')
+         modified_sentences = []
+
+         # Track what we've used to avoid patterns
+         used_transitions = []
+
+         for i, sent in enumerate(sentences):
+             if not sent.strip():
+                 continue
+
+             # Always use contractions where natural
+             sent = self.apply_contractions(sent)
+
+             # Add VERY occasional natural errors (5% chance)
+             if random.random() < 0.05 and len(sent.split()) > 15:
+                 error_types = [
+                     # Missing comma in compound sentence
+                     lambda s: s.replace(", and", " and", 1) if ", and" in s else s,
+                     # Wrong homophone
+                     lambda s: s.replace("their", "there", 1) if "their" in s and random.random() < 0.3 else s,
+                     # Missing apostrophe
+                     lambda s: s.replace("it's", "its", 1) if "it's" in s and random.random() < 0.3 else s,
+                 ]
+                 error_func = random.choice(error_types)
+                 sent = error_func(sent)
+
+             modified_sentences.append(sent)
+
+         return '. '.join(modified_sentences)
+
+     def apply_contractions(self, text):
+         """Apply common contractions - EXPANDED"""
+         contractions = {
+             "it is": "it's", "that is": "that's", "there is": "there's",
+             "he is": "he's", "she is": "she's", "what is": "what's",
+             "where is": "where's", "who is": "who's", "how is": "how's",
+             "cannot": "can't", "will not": "won't", "do not": "don't",
+             "does not": "doesn't", "did not": "didn't", "could not": "couldn't",
+             "should not": "shouldn't", "would not": "wouldn't", "is not": "isn't",
+             "are not": "aren't", "was not": "wasn't", "were not": "weren't",
+             "have not": "haven't", "has not": "hasn't", "had not": "hadn't",
+             "I am": "I'm", "you are": "you're", "we are": "we're",
+             "they are": "they're", "I have": "I've", "you have": "you've",
+             "we have": "we've", "they have": "they've", "I will": "I'll",
+             "you will": "you'll", "he will": "he'll", "she will": "she'll",
+             "we will": "we'll", "they will": "they'll", "I would": "I'd",
+             "you would": "you'd", "he would": "he'd", "she would": "she'd",
+             "we would": "we'd", "they would": "they'd", "could have": "could've",
+             "should have": "should've", "would have": "would've", "might have": "might've",
+             "must have": "must've", "there has": "there's", "here is": "here's",
+             "let us": "let's", "that will": "that'll", "who will": "who'll"
+         }
+
+         for full, contr in contractions.items():
+             if random.random() < 0.8:  # 80% chance to apply each contraction
+                 text = re.sub(r'\b' + full + r'\b', contr, text, flags=re.IGNORECASE)
+
+         return text
+
+     def add_minor_errors(self, text):
+         """Add very minor, human-like errors - MORE REALISTIC BUT CONTROLLED"""
+         # Occasionally miss Oxford comma (15% chance)
+         if random.random() < 0.15:
+             # Only in lists, not random commas
+             text = re.sub(r'(\w+), (\w+), and (\w+)', r'\1, \2 and \3', text)
+
+         # Sometimes use 'which' instead of 'that' (8% chance)
+         if random.random() < 0.08:
+             # Only for non-restrictive clauses
+             matches = re.finditer(r'\b(\w+) that (\w+)', text)
+             for match in list(matches)[:1]:  # Only first occurrence
+                 if match.group(1).lower() not in ['believe', 'think', 'know', 'say']:
+                     text = text.replace(match.group(0), f"{match.group(1)} which {match.group(2)}", 1)
+
+         # NEW: Add very occasional typos (2% chance per sentence) - REDUCED AND CONTROLLED
+         sentences = text.split('. ')
+         for i, sent in enumerate(sentences):
+             if random.random() < 0.02 and len(sent.split()) > 15:  # Only in longer sentences
+                 words = sent.split()
+                 # Pick a random word to potentially typo
+                 word_idx = random.randint(len(words)//2, len(words)-2)  # Avoid start/end
+                 word = words[word_idx].lower()
+
+                 # Only typo common words where typo won't break meaning
+                 safe_typos = {
+                     'the': 'teh',
+                     'and': 'adn',
+                     'that': 'taht',
+                     'with': 'wtih',
+                     'from': 'form',
+                     'because': 'becuase'
+                 }
+
+                 if word in safe_typos and random.random() < 0.5:
+                     typo = safe_typos[word]
+                     # Preserve original capitalization
+                     if words[word_idx][0].isupper():
+                         typo = typo[0].upper() + typo[1:]
+                     words[word_idx] = typo
+                     sentences[i] = ' '.join(words)
+
+         text = '. '.join(sentences)
+
+         # Skip double words - too distracting
+
+         # Mix up common homophones occasionally (2% chance) - ONLY SAFE ONES
+         if random.random() < 0.02:
+             safe_homophones = [
+                 ('its', "it's"),  # Very common mistake
+                 ('your', "you're"),  # Another common one
+             ]
+             for pair in safe_homophones:
+                 # Check context to avoid breaking meaning
+                 if f" {pair[0]} " in text and random.random() < 0.3:
+                     # Find one instance and check it's safe to replace
+                     pattern = rf'\b{pair[0]}\s+(\w+ing|\w+ed)\b'  # its + verb = likely should be it's
+                     if re.search(pattern, text):
+                         text = re.sub(pattern, f"{pair[1]} \\1", text, count=1)
+                         break
+
+         return text
+
+     def add_natural_human_patterns(self, text):
+         """Add natural human writing patterns that Originality AI associates with human text"""
+         sentences = self.split_into_sentences_advanced(text)
+         result_sentences = []
+
+         for i, sentence in enumerate(sentences):
+             if not sentence.strip():
+                 continue
+
+             # Natural contractions throughout
+             sentence = self.apply_contractions(sentence)
+
+             # Add natural speech patterns (15% chance)
+             if random.random() < 0.15 and len(sentence.split()) > 10:
+                 # Natural interruptions that humans actually use
+                 if random.random() < 0.5:
+                     # Add "you know" or "I mean" naturally
+                     words = sentence.split()
+                     if len(words) > 6:
+                         pos = random.randint(3, len(words)-3)
+                         if random.random() < 0.5:
+                             words.insert(pos, "you know,")
+                         else:
+                             words.insert(pos, "I mean,")
+                         sentence = ' '.join(words)
+                 else:
+                     # Start with natural opener
+                     openers = ["Look,", "See,", "Thing is,", "Honestly,", "Actually,"]
+                     sentence = random.choice(openers) + " " + sentence[0].lower() + sentence[1:]
+
+             # Add subtle errors that humans make (10% chance - reduced)
+             if random.random() < 0.10:
+                 words = sentence.split()
+                 if len(words) > 5:
+                     # Common comma omissions
+                     if ", and" in sentence and random.random() < 0.3:
+                         sentence = sentence.replace(", and", " and", 1)
+                     # Double words occasionally
+                     elif random.random() < 0.2:
+                         idx = random.randint(1, len(words)-2)
+                         if words[idx].lower() in ['the', 'a', 'to', 'in', 'on', 'at']:
+                             words.insert(idx+1, words[idx])
+                             sentence = ' '.join(words)
+
+             # Natural sentence combinations (20% chance)
+             if i < len(sentences) - 1 and random.random() < 0.2:
+                 next_sent = sentences[i+1].strip()
+                 if next_sent and len(sentence.split()) + len(next_sent.split()) < 25:
+                     # Natural connectors based on content
+                     if any(w in next_sent.lower() for w in ['but', 'however', 'although']):
+                         sentence = sentence.rstrip('.') + ", but " + next_sent[0].lower() + next_sent[1:]
+                         sentences[i+1] = ""  # Mark as processed
+                     elif any(w in next_sent.lower() for w in ['also', 'too', 'as well']):
+                         sentence = sentence.rstrip('.') + " and " + next_sent[0].lower() + next_sent[1:]
+                         sentences[i+1] = ""  # Mark as processed
+
+             result_sentences.append(sentence)
+
+         return ' '.join([s for s in result_sentences if s])
+
+     def vary_sentence_start(self, sentence):
+         """Vary sentence beginning to avoid repetitive patterns"""
+         if not sentence:
+             return sentence
+
+         words = sentence.split()
+         if len(words) < 5:
+             return sentence
+
+         # Different ways to start sentences naturally
+         variations = [
+             lambda s: "When " + s[0].lower() + s[1:] + ", it makes sense.",
+             lambda s: "If you think about it, " + s[0].lower() + s[1:],
+             lambda s: s + " This is important.",
+             lambda s: "The thing about " + words[0].lower() + " " + ' '.join(words[1:]) + " is clear.",
+             lambda s: "What's interesting is " + s[0].lower() + s[1:],
+             lambda s: s,  # Keep original sometimes
+         ]
+
+         # Pick a random variation
+         variation = random.choice(variations)
+         try:
+             return variation(sentence)
+         except:
+             return sentence
+
+     def split_into_sentences_advanced(self, text):
+         """Advanced sentence splitting using spaCy or NLTK"""
+         if SPACY_AVAILABLE:
+             try:
+                 nlp = spacy.load("en_core_web_sm")
+                 doc = nlp(text)
+                 sentences = [sent.text.strip() for sent in doc.sents]
+             except:
+                 sentences = sent_tokenize(text)
+         else:
+             # Fallback to NLTK
+             try:
+                 sentences = sent_tokenize(text)
+             except:
+                 # Final fallback to regex
+                 sentences = re.split(r'(?<=[.!?])\s+', text)
+
+         # Clean up sentences
+         return [s for s in sentences if s and len(s.strip()) > 0]
+
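# Editor's usage sketch (hypothetical, not part of the commit): the class is
# self-contained apart from its phrase tables, and its output is randomized by
# design, so exact results vary run to run:
#     hv = HumanLikeVariations()
#     hv.apply_contractions("It is likely that they are ready.")
#     # -> "It's likely that they're ready."  (each pattern fires with p=0.8)
#     hv.add_minor_errors("We packed apples, pears, and plums.")
#     # -> may drop the Oxford comma: "We packed apples, pears and plums."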
+ class SelectiveGrammarFixer:
+     """Minimal grammar fixes to maintain human-like quality while fixing critical errors"""
+
+     def __init__(self):
+         self.nlp = None
+         self.human_variations = HumanLikeVariations()
+
+     def fix_incomplete_sentences_only(self, text):
+         """Fix only incomplete sentences without over-correcting"""
+         if not text:
+             return text
+
+         sentences = text.split('. ')
+         fixed_sentences = []
+
+         for i, sent in enumerate(sentences):
+             sent = sent.strip()
+             if not sent:
+                 continue
+
+             # Only fix if sentence is incomplete
+             if sent and sent[-1] not in '.!?':
+                 # Check if it's the last sentence
+                 if i == len(sentences) - 1:
+                     # Add period if it's clearly a statement
+                     if not sent.endswith(':') and not sent.endswith(','):
+                         sent += '.'
+                 else:
+                     # Middle sentences should have periods
+                     sent += '.'
+
+             # Ensure first letter capitalization ONLY after sentence endings
+             if i > 0 and sent and sent[0].islower():
+                 # Check if previous sentence ended with punctuation
+                 if fixed_sentences and fixed_sentences[-1].rstrip().endswith(('.', '!', '?')):
+                     sent = sent[0].upper() + sent[1:]
+             elif i == 0 and sent and sent[0].islower():
+                 # First sentence should be capitalized
+                 sent = sent[0].upper() + sent[1:]
+
+             fixed_sentences.append(sent)
+
+         result = ' '.join(fixed_sentences)
+
+         # Add natural human variations (but we need to reference the main class method)
+         # This will be called from the smart_fix method instead
+
+         return result
+
+     def fix_basic_punctuation_errors(self, text):
+         """Fix only the most egregious punctuation errors"""
+         if not text:
+             return text
+
+         # Fix double spaces (human-like error)
+         text = re.sub(r'\s{2,}', ' ', text)
+
+         # Fix space before punctuation (common error)
+         text = re.sub(r'\s+([.,!?;:])', r'\1', text)
+
+         # Fix missing space after punctuation (human-like)
+         text = re.sub(r'([.,!?])([A-Z])', r'\1 \2', text)
+
+         # Fix accidental double punctuation
+         text = re.sub(r'([.!?])\1+', r'\1', text)
+
+         # Fix "i" capitalization (common human error to fix)
+         text = re.sub(r'\bi\b', 'I', text)
+
+         return text
+
+     def preserve_natural_variations(self, text):
+         """Keep some natural human-like variations"""
+         # Don't fix everything - leave some variety
+         # Only fix if really broken
+         if text.count('.') == 0 and len(text.split()) > 20:
+             # Long text with no periods - needs fixing
+             words = text.split()
+             # Add periods every 15-25 words naturally (more variation)
+             new_text = []
+             for i, word in enumerate(words):
+                 new_text.append(word)
+                 if i > 0 and i % random.randint(12, 25) == 0:
+                     if word[-1] not in '.!?,;:':
+                         new_text[-1] = word + '.'
+                         # Capitalize next word if it's not an acronym
+                         if i + 1 < len(words) and words[i + 1][0].islower():
+                             # Check if it's not likely an acronym
+                             if not words[i + 1].isupper():
+                                 words[i + 1] = words[i + 1][0].upper() + words[i + 1][1:]
+             text = ' '.join(new_text)
+
+         return text
+
+     def smart_fix(self, text):
+         """Apply minimal fixes to maintain human-like quality"""
+         # Apply fixes in order of importance
+         text = self.fix_basic_punctuation_errors(text)
+         text = self.fix_incomplete_sentences_only(text)
+         text = self.preserve_natural_variations(text)
+
+         return text
+
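# Editor's usage sketch (hypothetical): smart_fix chains the three fixers in
# order, so a bare fragment gains terminal punctuation and capitalization:
#     fixer = SelectiveGrammarFixer()
#     fixer.smart_fix("results were mixed but promising")
#     # -> "Results were mixed but promising."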
+ class EnhancedDipperHumanizer:
+     def __init__(self):
+         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         print(f"Using device: {self.device}")
+
+         # Clear GPU cache
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+
+         # Initialize grammar fixer
+         self.grammar_fixer = SelectiveGrammarFixer()
+
+         # Try to load spaCy if available
+         self.nlp = None
+         self.use_spacy = False
+         if SPACY_AVAILABLE:
+             try:
+                 self.nlp = spacy.load("en_core_web_sm")
+                 self.use_spacy = True
+                 print("spaCy loaded successfully")
+             except:
+                 print("spaCy model not found, using NLTK for sentence splitting")
+
+         try:
+             # Load Dipper paraphraser WITHOUT 8-bit quantization for better performance
+             print("Loading Dipper paraphraser model...")
+             self.tokenizer = T5Tokenizer.from_pretrained('google/t5-v1_1-xxl')
+             self.model = T5ForConditionalGeneration.from_pretrained(
+                 "kalpeshk2011/dipper-paraphraser-xxl",
+                 device_map="auto",  # This will distribute across 4xL40S automatically
+                 torch_dtype=torch.float16,
+                 low_cpu_mem_usage=True
+             )
+             print("Dipper model loaded successfully!")
+             self.is_dipper = True
+
+         except Exception as e:
+             print(f"Error loading Dipper model: {str(e)}")
+             print("Falling back to Flan-T5-XL...")
+             self.is_dipper = False
+
+             # Fallback to Flan-T5-XL
+             try:
+                 self.model = T5ForConditionalGeneration.from_pretrained(
+                     "google/flan-t5-xl",
+                     torch_dtype=torch.float16,
+                     low_cpu_mem_usage=True,
+                     device_map="auto"
+                 )
+                 self.tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-xl")
+                 print("Loaded Flan-T5-XL as fallback")
+             except:
+                 raise Exception("Could not load any model. Please check your system resources.")
+
+         # Load BART as secondary model
+         try:
+             print("Loading BART model for additional variation...")
+             self.bart_model = AutoModelForSeq2SeqLM.from_pretrained(
+                 "eugenesiow/bart-paraphrase",
+                 torch_dtype=torch.float16,
+                 device_map="auto"  # Distribute across GPUs
+             )
+             self.bart_tokenizer = AutoTokenizer.from_pretrained("eugenesiow/bart-paraphrase")
+             self.use_bart = True
+             print("BART model loaded successfully")
+         except:
+             print("BART model not available")
+             self.use_bart = False
+
+         # Initialize human variations handler
+         self.human_variations = HumanLikeVariations()
+
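# Editor's note (assumption, not stated in the commit): the constructor eagerly
# loads up to three checkpoints; dipper-paraphraser-xxl is T5-XXL scale (~11B
# parameters), so its fp16 weights alone need roughly 22 GB of GPU memory before
# activations. A minimal sketch, hypothetical names:
#     humanizer = EnhancedDipperHumanizer()
#     # falls back to Flan-T5-XL on failure, and raises only if neither loads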
+     def add_natural_human_patterns(self, text):
+         """Add natural human writing patterns that Originality AI associates with human text"""
+         sentences = self.split_into_sentences_advanced(text)
+         result_sentences = []
+
+         for i, sentence in enumerate(sentences):
+             if not sentence.strip():
+                 continue
+
+             # Natural contractions throughout
+             sentence = self.apply_contractions(sentence)
+
+             # Add natural speech patterns (15% chance - balanced)
+             if random.random() < 0.15 and len(sentence.split()) > 10:
+                 # Natural interruptions that humans actually use
+                 if random.random() < 0.5:
+                     # Add "you know" or "I mean" naturally
+                     words = sentence.split()
+                     if len(words) > 6:
+                         pos = random.randint(3, len(words)-3)
+                         if random.random() < 0.5:
+                             words.insert(pos, "you know,")
+                         else:
+                             words.insert(pos, "I mean,")
+                         sentence = ' '.join(words)
+                 else:
+                     # Start with natural opener
+                     openers = ["Look,", "See,", "Thing is,", "Honestly,", "Actually,"]
+                     sentence = random.choice(openers) + " " + sentence[0].lower() + sentence[1:]
+
+             # Add subtle errors that humans make (8% chance)
+             if random.random() < 0.08:
+                 words = sentence.split()
+                 if len(words) > 5:
+                     # Common comma omissions
+                     if ", and" in sentence and random.random() < 0.3:
+                         sentence = sentence.replace(", and", " and", 1)
+                     # Double words occasionally
+                     elif random.random() < 0.2:
+                         idx = random.randint(1, len(words)-2)
+                         if words[idx].lower() in ['the', 'a', 'to', 'in', 'on', 'at']:
+                             words.insert(idx+1, words[idx])
+                             sentence = ' '.join(words)
+
+             # Natural sentence combinations (20% chance)
+             if i < len(sentences) - 1 and random.random() < 0.2:
+                 next_sent = sentences[i+1].strip()
+                 if next_sent and len(sentence.split()) + len(next_sent.split()) < 25:
+                     # Natural connectors based on content
+                     if any(w in next_sent.lower() for w in ['but', 'however', 'although']):
+                         sentence = sentence.rstrip('.') + ", but " + next_sent[0].lower() + next_sent[1:]
+                         sentences[i+1] = ""  # Mark as processed
+                     elif any(w in next_sent.lower() for w in ['also', 'too', 'as well']):
+                         sentence = sentence.rstrip('.') + " and " + next_sent[0].lower() + next_sent[1:]
+                         sentences[i+1] = ""  # Mark as processed
+
+             result_sentences.append(sentence)
+
+         return ' '.join([s for s in result_sentences if s])
+
+     def vary_sentence_start(self, sentence):
+         """Vary sentence beginning to avoid repetitive patterns"""
+         if not sentence:
+             return sentence
+
+         words = sentence.split()
+         if len(words) < 5:
+             return sentence
+
+         # Different ways to start sentences naturally
+         variations = [
+             lambda s: "When " + s[0].lower() + s[1:] + ", it makes sense.",
+             lambda s: "If you think about it, " + s[0].lower() + s[1:],
+             lambda s: s + " This is important.",
+             lambda s: "The thing about " + words[0].lower() + " " + ' '.join(words[1:]) + " is clear.",
+             lambda s: "What's interesting is " + s[0].lower() + s[1:],
+             lambda s: s,  # Keep original sometimes
+         ]
+
+         # Pick a random variation
+         variation = random.choice(variations)
+         try:
+             return variation(sentence)
+         except:
+             return sentence
+
+     def apply_contractions(self, text):
+         """Apply common contractions to make text more natural"""
+         contractions = {
+             "it is": "it's", "that is": "that's", "there is": "there's",
+             "he is": "he's", "she is": "she's", "what is": "what's",
+             "where is": "where's", "who is": "who's", "how is": "how's",
+             "cannot": "can't", "will not": "won't", "do not": "don't",
+             "does not": "doesn't", "did not": "didn't", "could not": "couldn't",
+             "should not": "shouldn't", "would not": "wouldn't", "is not": "isn't",
+             "are not": "aren't", "was not": "wasn't", "were not": "weren't",
+             "have not": "haven't", "has not": "hasn't", "had not": "hadn't",
+             "I am": "I'm", "you are": "you're", "we are": "we're",
+             "they are": "they're", "I have": "I've", "you have": "you've",
+             "we have": "we've", "they have": "they've", "I will": "I'll",
+             "you will": "you'll", "he will": "he'll", "she will": "she'll",
+             "we will": "we'll", "they will": "they'll", "I would": "I'd",
+             "you would": "you'd", "he would": "he'd", "she would": "she'd",
+             "we would": "we'd", "they would": "they'd", "could have": "could've",
+             "should have": "should've", "would have": "would've", "might have": "might've",
+             "must have": "must've", "there has": "there's", "here is": "here's",
+             "let us": "let's", "that will": "that'll", "who will": "who'll"
+         }
+
+         for full, contr in contractions.items():
+             text = re.sub(r'\b' + full + r'\b', contr, text, flags=re.IGNORECASE)
+
+         return text
+
+     def should_skip_element(self, element, text):
+         """Determine if an element should be skipped from paraphrasing"""
+         if not text or len(text.strip()) < 3:
+             return True
+
+         # Skip JavaScript code inside script tags
+         parent = element.parent
+         if parent and parent.name in ['script', 'style', 'noscript']:
+             return True
+
+         # Skip headings (h1-h6)
+         if parent and parent.name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'title']:
+             return True
+
+         # Skip content inside <strong> and <b> tags
+         if parent and parent.name in ['strong', 'b']:
+             return True
+
+         # Skip table content
+         if parent and (parent.name in ['td', 'th'] or any(p.name == 'table' for p in parent.parents)):
+             return True
+
+         # Special handling for content inside tables
+         # Skip if it's inside strong/b/h1-h6 tags AND also inside a table
+         if parent:
+             # Check if we're inside a table
+             is_in_table = any(p.name == 'table' for p in parent.parents)
+             if is_in_table:
+                 # If we're in a table, skip any text that's inside formatting tags
+                 if parent.name in ['strong', 'b', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'em', 'i']:
+                     return True
+                 # Also check if parent's parent is a formatting tag
+                 if parent.parent and parent.parent.name in ['strong', 'b', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
+                     return True
+
+         # Skip table of contents
+         if parent:
+             parent_text = str(parent).lower()
+             if any(toc in parent_text for toc in ['table of contents', 'toc-', 'contents']):
+                 return True
+
+         # Skip CTAs and buttons
+         if parent and parent.name in ['button', 'a']:
+             return True
+
+         # Skip if parent has onclick or other event handlers
+         if parent and parent.attrs:
+             event_handlers = ['onclick', 'onchange', 'onsubmit', 'onload', 'onmouseover', 'onmouseout']
+             if any(handler in parent.attrs for handler in event_handlers):
+                 return True
+
+         # Special check for testimonial cards - check up to 3 levels of ancestors
+         if parent:
+             ancestors_to_check = []
+             current = parent
+             for _ in range(3):  # Check up to 3 levels up
+                 if current:
+                     ancestors_to_check.append(current)
+                     current = current.parent
+
+             # Check if any ancestor has testimonial-card class
+             for ancestor in ancestors_to_check:
+                 if ancestor and ancestor.get('class'):
+                     classes = ancestor.get('class', [])
+                     if isinstance(classes, list):
+                         if any('testimonial-card' in str(cls) for cls in classes):
+                             return True
+                     elif isinstance(classes, str) and 'testimonial-card' in classes:
+                         return True
+
+         # Skip if IMMEDIATE parent or element itself has skip-worthy classes/IDs
+         skip_indicators = [
+             'button', 'btn', 'heading', 'title', 'caption',
+             'toc-', 'contents', 'quiz', 'tip', 'note', 'alert',
+             'warning', 'info', 'success', 'error', 'code', 'pre',
+             'stats-grid', 'testimonial-card',
+             'cta-box', 'quiz-container', 'contact-form',
+             'faq-question', 'sidebar', 'widget', 'banner',
+             'author-intro', 'testimonial', 'review', 'feedback',
+             'floating-', 'stat-', 'progress-', 'option', 'results',
+             'question-container', 'quiz-',
+             'comparision-tables', 'process-flowcharts', 'infographics', 'cost-breakdown'
+         ]
+
+         # Check only immediate parent and grandparent (not all ancestors)
+         elements_to_check = [parent]
+         if parent and parent.parent:
+             elements_to_check.append(parent.parent)
+
+         for elem in elements_to_check:
+             if not elem:
+                 continue
+
+             # Check element's class
+             elem_class = elem.get('class', [])
+             if isinstance(elem_class, list):
+                 class_str = ' '.join(str(cls).lower() for cls in elem_class)
+                 if any(indicator in class_str for indicator in skip_indicators):
+                     return True
+
+             # Check element's ID
+             elem_id = elem.get('id', '')
+             if any(indicator in str(elem_id).lower() for indicator in skip_indicators):
+                 return True
+
+         # Skip short phrases that might be UI elements
+         word_count = len(text.split())
+         if word_count <= 5:
+             ui_patterns = [
+                 'click', 'download', 'learn more', 'read more', 'sign up',
+                 'get started', 'try now', 'buy now', 'next', 'previous',
+                 'back', 'continue', 'submit', 'cancel', 'get now', 'book your',
+                 'check out:', 'see also:', 'related:', 'question', 'of'
+             ]
+             if any(pattern in text.lower() for pattern in ui_patterns):
+                 return True
+
+         # Skip very short content in styled containers
+         if parent and parent.name in ['div', 'section', 'aside', 'blockquote']:
+             style = parent.get('style', '')
+             if 'border' in style or 'background' in style:
+                 if word_count <= 20:
+                     # But don't skip if it's inside a paragraph
+                     if not any(p.name == 'p' for p in parent.parents):
+                         return True
+
+         return False
+
+     def is_likely_acronym_or_proper_noun(self, word):
+         """Check if a word is likely an acronym or part of a proper noun"""
+         # Common acronyms and abbreviations
+         acronyms = {'MBA', 'CEO', 'USA', 'UK', 'GMAT', 'GRE', 'SAT', 'ACT', 'PhD', 'MD', 'IT', 'AI', 'ML'}
+
+         # Check if it's in our acronym list
+         if word.upper() in acronyms:
+             return True
+
+         # Check if it's all caps (likely acronym)
+         if word.isupper() and len(word) > 1:
+             return True
+
+         # Check if it follows patterns like "Edition", "Focus", etc. that often come after proper nouns
+         proper_noun_continuations = {
+             'Edition', 'Version', 'Series', 'Focus', 'System', 'Method', 'School',
+             'University', 'College', 'Institute', 'Academy', 'Center', 'Centre'
+         }
+
+         if word in proper_noun_continuations:
+             return True
+
+         return False
+
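# Editor's usage sketch (hypothetical): should_skip_element works on bs4 text
# nodes, so headings and UI labels are filtered while body copy passes through.
# The 'humanizer' name is illustrative:
#     soup = BeautifulSoup("<h1>Title</h1><p>Some body text here today.</p>", "html.parser")
#     nodes = list(soup.find_all(string=True))
#     humanizer.should_skip_element(nodes[0], str(nodes[0]))  # True  (inside <h1>)
#     humanizer.should_skip_element(nodes[1], str(nodes[1]))  # False (plain <p> text)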
+     def clean_model_output_enhanced(self, text):
+         """Enhanced cleaning that preserves more natural structure"""
+         if not text:
+             return ""
+
+         # Store original for fallback
+         original = text
+
+         # Remove ONLY clear model artifacts
+         text = re.sub(r'^lexical\s*=\s*\d+\s*,\s*order\s*=\s*\d+\s*', '', text, flags=re.IGNORECASE)
+         text = re.sub(r'<sent>\s*', '', text, flags=re.IGNORECASE)
+         text = re.sub(r'\s*</sent>', '', text, flags=re.IGNORECASE)
+
+         # Only remove clear prefixes
+         if text.lower().startswith('paraphrase:'):
+             text = text[11:].strip()
+         elif text.lower().startswith('rewrite:'):
+             text = text[8:].strip()
+
+         # Clean up backticks and weird punctuation
+         text = re.sub(r'``+', '', text)
+         text = re.sub(r"''", '"', text)
+
+         # Remove awkward phrase markers
+         text = re.sub(r'- actually, scratch that -', '', text)
+         text = re.sub(r'- wait, let me back up -', '', text)
+         text = re.sub(r'- you know what I mean\? -', '', text)
+         text = re.sub(r'- okay, here\'s the thing -', '', text)
+         text = re.sub(r'- bear with me here -', '', text)
+         text = re.sub(r'- I\'m serious -', '', text)
+         text = re.sub(r'- or maybe I should say -', '', text)
+         text = re.sub(r'- or rather,', '', text)
+         text = re.sub(r'- think about it -', '', text)
+
+         # Clean up multiple spaces
+         text = re.sub(r'\s+', ' ', text)
+
+         # Remove leading non-letter characters carefully
+         text = re.sub(r'^[^a-zA-Z_]+', '', text)
+
+         # If we accidentally removed too much, use original
+         if len(text) < len(original) * 0.5:
+             text = original
+
+         return text.strip()
+
+     def paraphrase_with_dipper(self, text, lex_diversity=60, order_diversity=20):
+         """Paraphrase text using Dipper model with sentence-level processing"""
+         if not text or len(text.strip()) < 3:
+             return text
+
+         # Split into sentences for better control
+         sentences = self.split_into_sentences_advanced(text)
+         paraphrased_sentences = []
+
+         # Track sentence patterns to avoid repetition
+         sentence_starts = []
+
+         for i, sentence in enumerate(sentences):
+             if len(sentence.strip()) < 3:
+                 paraphrased_sentences.append(sentence)
+                 continue
+
+             try:
+                 # BALANCED diversity for Originality AI (100% human with better quality)
+                 if len(sentence.split()) < 10:
+                     lex_diversity = 70  # High but not extreme
+                     order_diversity = 25
+                 else:
+                     lex_diversity = 82  # Balanced diversity
+                     order_diversity = 30  # Moderate order diversity
+
+                 lex_code = int(100 - lex_diversity)
+                 order_code = int(100 - order_diversity)
+
+                 # Format input for Dipper
+                 if self.is_dipper:
+                     input_text = f"lexical = {lex_code}, order = {order_code} <sent> {sentence} </sent>"
+                 else:
+                     input_text = f"paraphrase: {sentence}"
+
+                 # Tokenize
+                 inputs = self.tokenizer(
+                     input_text,
+                     return_tensors="pt",
+                     max_length=512,
+                     truncation=True,
+                     padding=True
+                 )
+
+                 # Move to device
+                 if hasattr(self.model, 'device_map') and self.model.device_map:
+                     device = next(iter(self.model.device_map.values()))
+                     inputs = {k: v.to(device) for k, v in inputs.items()}
+                 else:
+                     inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+                 # Generate with appropriate variation
+                 original_length = len(sentence.split())
+                 max_new_length = int(original_length * 1.4)
+
+                 # High variation parameters
+                 temp = 0.85  # Slightly reduced from 0.9
+                 top_p_val = 0.92  # Slightly reduced from 0.95
+
+                 with torch.no_grad():
+                     outputs = self.model.generate(
+                         **inputs,
+                         max_length=max_new_length + 20,
+                         min_length=max(5, int(original_length * 0.7)),
+                         do_sample=True,
+                         top_p=top_p_val,
+                         temperature=temp,
+                         no_repeat_ngram_size=4,  # Allow more repetition for naturalness
+                         num_beams=1,  # Greedy for more randomness
+                         early_stopping=True
+                     )
+
+                 # Decode
+                 paraphrased = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+                 # Clean model artifacts
+                 paraphrased = self.clean_model_output_enhanced(paraphrased)
+
+                 # Fix incomplete sentences
+                 paraphrased = self.fix_incomplete_sentence_smart(paraphrased, sentence)
+
+                 # Ensure variety in sentence starts
+                 first_words = paraphrased.split()[:2] if paraphrased.split() else []
+                 if first_words and i > 0:
+                     # Check if we're repeating patterns
+                     first_phrase = ' '.join(first_words).lower()
+                     if sentence_starts.count(first_phrase) >= 2:
+                         # Try to rephrase the beginning
+                         paraphrased = self.vary_sentence_start(paraphrased)
+                     sentence_starts.append(first_phrase)
+
+                 # Ensure reasonable length
+                 if len(paraphrased.split()) > max_new_length:
+                     paraphrased = ' '.join(paraphrased.split()[:max_new_length])
+
+                 paraphrased_sentences.append(paraphrased)
+
+             except Exception as e:
+                 print(f"Error paraphrasing sentence: {str(e)}")
+                 paraphrased_sentences.append(sentence)
+
+         # Join sentences back
+         result = ' '.join(paraphrased_sentences)
+
+         # Apply natural human patterns
+         result = self.add_natural_human_patterns(result)
+
+         return result
+
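# Worked example (editorial sketch; values follow the method above): for a long
# sentence, lex_diversity=82 and order_diversity=30 give control codes
#     lex_code = 100 - 82 = 18    and    order_code = 100 - 30 = 70
# so the model input becomes
#     "lexical = 18, order = 70 <sent> The committee approved the plan. </sent>"
# Lower codes ask Dipper for more aggressive lexical and word-order change.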
+     def fix_incomplete_sentence_smart(self, generated, original):
+         """Smarter sentence completion that maintains natural flow"""
+         if not generated or not generated.strip():
+             return original
+
+         generated = generated.strip()
+
+         # Check if the sentence seems complete semantically
+         words = generated.split()
+         if len(words) >= 3:
+             # Check if last word is a good ending word
+             last_word = words[-1].lower().rstrip('.,!?;:')
+
+             # Common ending words that might not need punctuation fix
+             ending_words = {
+                 'too', 'also', 'well', 'though', 'however',
+                 'furthermore', 'moreover', 'indeed', 'anyway',
+                 'regardless', 'nonetheless', 'therefore', 'thus'
+             }
+
+             # If it ends with a good word, just add appropriate punctuation
+             if last_word in ending_words:
+                 if generated[-1] not in '.!?':
+                     generated += '.'
+                 return generated
+
+         # Check for cut-off patterns
+         if len(words) > 0:
+             last_word = words[-1]
+
+             # Remove if it's clearly cut off (1-2 chars, no vowels)
+             # But don't remove valid short words like "is", "of", "to", etc.
+             short_valid_words = {'is', 'of', 'to', 'in', 'on', 'at', 'by', 'or', 'if', 'so', 'up', 'no', 'we', 'he', 'me', 'be', 'do', 'go'}
+             if (len(last_word) <= 2 and
+                     last_word.lower() not in short_valid_words and
+                     not any(c in 'aeiouAEIOU' for c in last_word)):
+                 words = words[:-1]
+                 generated = ' '.join(words)
+
+         # Add ending punctuation based on context
+         if generated and generated[-1] not in '.!?:,;':
+             # Check original ending
+             orig_stripped = original.strip()
+             if orig_stripped.endswith('?'):
+                 # Check if generated seems like a question
+                 question_words = ['what', 'why', 'how', 'when', 'where', 'who', 'which', 'is', 'are', 'do', 'does', 'can', 'could', 'would', 'should']
+                 first_word = generated.split()[0].lower() if generated.split() else ''
+                 if first_word in question_words:
+                     generated += '?'
+                 else:
+                     generated += '.'
+             elif orig_stripped.endswith('!'):
+                 # Check if generated seems exclamatory
+                 exclaim_words = ['amazing', 'incredible', 'fantastic', 'terrible', 'awful', 'wonderful', 'excellent']
+                 if any(word in generated.lower() for word in exclaim_words):
+                     generated += '!'
+                 else:
+                     generated += '.'
+             elif orig_stripped.endswith(':'):
+                 generated += ':'
+             else:
+                 generated += '.'
+
+         # Ensure first letter is capitalized ONLY if it's sentence start
+         # Don't capitalize words like "iPhone" or "eBay"
+         if generated and generated[0].islower() and not self.is_likely_acronym_or_proper_noun(generated.split()[0]):
+             generated = generated[0].upper() + generated[1:]
+
+         return generated
+
+     def split_into_sentences_advanced(self, text):
+         """Advanced sentence splitting using spaCy or NLTK"""
+         if self.use_spacy and self.nlp:
+             doc = self.nlp(text)
+             sentences = [sent.text.strip() for sent in doc.sents]
+         else:
+             # Fallback to NLTK
+             try:
+                 sentences = sent_tokenize(text)
+             except:
+                 # Final fallback to regex
+                 sentences = re.split(r'(?<=[.!?])\s+', text)
+
+         # Clean up sentences
+         return [s for s in sentences if s and len(s.strip()) > 0]
+
+     def paraphrase_with_bart(self, text):
+         """Additional paraphrasing with BART for more variation"""
+         if not self.use_bart or not text or len(text.strip()) < 3:
+             return text
+
+         try:
+             # Process in smaller chunks for BART
+             sentences = self.split_into_sentences_advanced(text)
+             paraphrased_sentences = []
+
+             for sentence in sentences:
+                 if len(sentence.split()) < 5:
+                     paraphrased_sentences.append(sentence)
+                     continue
+
+                 inputs = self.bart_tokenizer(
+                     sentence,
+                     return_tensors='pt',
+                     max_length=128,
+                     truncation=True
+                 )
+
+                 # Move to appropriate device
+                 if hasattr(self.bart_model, 'device_map') and self.bart_model.device_map:
+                     device = next(iter(self.bart_model.device_map.values()))
+                     inputs = {k: v.to(device) for k, v in inputs.items()}
+                 else:
+                     inputs = {k: v.to(self.device) for k, v in inputs.items()}
+
+                 original_length = len(sentence.split())
+
+                 with torch.no_grad():
+                     outputs = self.bart_model.generate(
+                         **inputs,
+                         max_length=int(original_length * 1.4) + 10,
+                         min_length=max(5, int(original_length * 0.6)),
+                         num_beams=2,
+                         temperature=1.1,  # Higher temperature
+                         do_sample=True,
+                         top_p=0.9,
+                         early_stopping=True
+                     )
+
+                 paraphrased = self.bart_tokenizer.decode(outputs[0], skip_special_tokens=True)
+
+                 # Fix incomplete sentences
+                 paraphrased = self.fix_incomplete_sentence_smart(paraphrased, sentence)
+
+                 paraphrased_sentences.append(paraphrased)
+
+             result = ' '.join(paraphrased_sentences)
+
+             # Apply minimal grammar fixes
+             result = self.grammar_fixer.smart_fix(result)
+
+             return result
+
+         except Exception as e:
+             print(f"Error in BART paraphrasing: {str(e)}")
+             return text
+
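# Editor's note (assumption): BART here is a second, independent rewrite pass,
# so running it after Dipper compounds the paraphrase. Combining do_sample=True
# with num_beams=2 puts transformers into beam-sample mode, and temperature=1.1
# flattens the token distribution slightly, trading fluency for extra variation.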
+     def apply_sentence_variation(self, text):
+         """Apply natural sentence structure variations - HUMAN-LIKE FLOW"""
+         sentences = self.split_into_sentences_advanced(text)
+         varied_sentences = []
+
+         # Track patterns to ensure variety
+         last_sentence_length = 0
+
+         for i, sentence in enumerate(sentences):
+             if not sentence.strip():
+                 continue
+
+             words = sentence.split()
+             current_length = len(words)
+
+             # Natural sentence length variation
+             if last_sentence_length > 20 and current_length > 20:
+                 # Break up if two long sentences in a row
+                 if ',' in sentence:
+                     parts = sentence.split(',', 1)
+                     if len(parts) == 2 and len(parts[1].split()) > 8:
+                         varied_sentences.append(parts[0].strip() + '.')
+                         second_part = parts[1].strip()
+                         if second_part and second_part[0].islower():
+                             second_part = second_part[0].upper() + second_part[1:]
+                         varied_sentences.append(second_part)
+                         last_sentence_length = len(parts[1].split())
+                         continue
+
+             # Natural combinations for flow
+             if (i < len(sentences) - 1 and
+                     current_length < 10 and
+                     len(sentences[i+1].split()) < 10):
+
+                 next_sent = sentences[i+1].strip()
+                 # Only combine if it makes semantic sense
+                 if next_sent and any(next_sent.lower().startswith(w) for w in ['it', 'this', 'that', 'which']):
+                     combined = sentence.rstrip('.') + ' ' + next_sent[0].lower() + next_sent[1:]
+                     varied_sentences.append(combined)
+                     sentences[i+1] = ""
+                     last_sentence_length = len(combined.split())
+                     continue
+
+             varied_sentences.append(sentence)
+             last_sentence_length = current_length
+
+         return ' '.join([s for s in varied_sentences if s])
+
+     def fix_punctuation(self, text):
+         """Comprehensive punctuation and formatting fixes"""
+         if not text:
+             return ""
+
+         # First, clean any remaining model artifacts
+         text = self.clean_model_output_enhanced(text)
+
+         # Fix weird symbols and characters using safe replacements
+         text = text.replace('<>', '')  # Remove empty angle brackets
+
+         # Normalize quotes - use replace instead of regex for problematic characters
+         text = text.replace('«', '"').replace('»', '"')
+         text = text.replace('„', '"').replace('“', '"').replace('”', '"')
+         text = text.replace('‘', "'").replace('’', "'")
+         text = text.replace('–', '-').replace('—', '-')
+
+         # Fix colon issues
+         text = re.sub(r'\.:', ':', text)  # Remove period before colon
+         text = re.sub(r':\s*\.', ':', text)  # Remove period after colon
+
+         # Fix basic spacing
+         text = re.sub(r'\s+', ' ', text)  # Multiple spaces to single
+         text = re.sub(r'\s+([.,!?;:])', r'\1', text)  # Remove space before punctuation
+         text = re.sub(r'([.,!?;:])\s*([.,!?;:])', r'\1', text)  # Remove double punctuation
+         text = re.sub(r'([.!?])\s*\1+', r'\1', text)  # Remove repeated punctuation
+
+         # Fix colons
+         text = re.sub(r':\s*([.,!?])', ':', text)  # Remove punctuation after colon
+         text = re.sub(r'([.,!?])\s*:', ':', text)  # Remove punctuation before colon
+         text = re.sub(r':+', ':', text)  # Multiple colons to one
+
+         # Fix quotes and parentheses
+         text = re.sub(r'"\s*([^"]*?)\s*"', r'"\1"', text)
+         text = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", text)
+         text = re.sub(r'\(\s*([^)]*?)\s*\)', r'(\1)', text)
+
+         # Fix sentence capitalization more carefully
+         # Split on ACTUAL sentence endings only
+         sentences = re.split(r'(?<=[.!?])\s+', text)
+         fixed_sentences = []
+
+         for i, sentence in enumerate(sentences):
+             if not sentence:
+                 continue
+
+             # Only capitalize the first letter if it's actually lowercase
+             # and not part of a special case (like iPhone, eBay, etc.)
+             words = sentence.split()
+             if words:
+                 first_word = words[0]
+                 # Check if it's not an acronym or proper noun that should stay lowercase
+                 if (first_word[0].islower() and
+                         not self.is_likely_acronym_or_proper_noun(first_word)):
+                     # Only capitalize if it's a regular word
+                     sentence = first_word[0].upper() + first_word[1:] + ' ' + ' '.join(words[1:])
+
+             fixed_sentences.append(sentence)
+
+         text = ' '.join(fixed_sentences)
+
+         # Fix common issues
+         text = re.sub(r'\bi\b', 'I', text)  # Capitalize 'I'
+         text = re.sub(r'\.{2,}', '.', text)  # Multiple periods to one
+         text = re.sub(r',{2,}', ',', text)  # Multiple commas to one
+         text = re.sub(r'\s*,\s*,\s*', ', ', text)  # Double commas with spaces
+
+         # Remove weird artifacts
+         text = re.sub(r'\b(CHAPTER\s+[IVX]+|SECTION\s+\d+)\b[^\w]*', '', text, flags=re.IGNORECASE)
+
+         # Fix abbreviations
+         text = re.sub(r'\betc\s*\.\s*\.', 'etc.', text)
+         text = re.sub(r'\be\.g\s*\.\s*[,\s]', 'e.g., ', text)
+         text = re.sub(r'\bi\.e\s*\.\s*[,\s]', 'i.e., ', text)
+
+         # Fix numbers with periods (like "1. " at start of lists)
+         text = re.sub(r'(\d+)\.\s+', r'\1. ', text)
+
+         # Fix bold/strong tags punctuation
+         text = self.fix_bold_punctuation(text)
+
+         # Clean up any remaining issues
+         text = re.sub(r'\s+([.,!?;:])', r'\1', text)  # Final space cleanup
+         text = re.sub(r'([.,!?;:])\s{2,}', r'\1 ', text)  # Fix multiple spaces after punctuation
+
+         # Ensure ending punctuation
+         text = text.strip()
+         if text and text[-1] not in '.!?':
+             # Don't add period if it ends with colon (likely a list header)
+             if not text.endswith(':'):
+                 text += '.'
+
+         return text
+
+ def fix_bold_punctuation(self, text):
1262
+ """Fix punctuation issues around bold/strong tags"""
1263
+ # Check if this is likely a list item with colon pattern
1264
+ def is_list_item_with_colon(text):
1265
+ # Pattern: starts with or contains <strong>Text:</strong> or <b>Text:</b>
1266
+ list_pattern = r'^\s*(?:[-•*▪▫◦‣⁃]\s*)?<(?:strong|b)>[^<]+:</(?:strong|b)>'
1267
+ return bool(re.search(list_pattern, text))
1268
+
1269
+ # If it's a list item with colon, preserve the format
1270
+ if is_list_item_with_colon(text):
1271
+ # Just clean up spacing but preserve the colon inside bold
1272
+ text = re.sub(r'<(strong|b)>\s*([^:]+)\s*:\s*</\1>', r'<\1>\2:</\1>', text)
1273
+ return text
1274
+
1275
+ # Pattern to find bold/strong content
1276
+ bold_pattern = r'<(strong|b)>(.*?)</\1>'
1277
+
1278
+ def fix_bold_match(match):
1279
+ tag = match.group(1)
1280
+ content = match.group(2).strip()
1281
+
1282
+ if not content:
1283
+ return f'<{tag}></{tag}>'
1284
+
1285
+ # Check if this is a list header (contains colon at the end)
1286
+ if content.endswith(':'):
1287
+ # Preserve list headers with colons
1288
+ return f'<{tag}>{content}</{tag}>'
1289
+
1290
+ # Remove any periods at the start or end of bold content
1291
+ content = content.strip('.')
1292
+
1293
+ # Check if this bold text is at the start of a sentence
1294
+ # (preceded by nothing, or by '. ', '! ', '? ')
1295
+ start_pos = match.start()
1296
+ is_sentence_start = (start_pos == 0 or
1297
+ (start_pos > 2 and text[start_pos-2:start_pos] in ['. ', '! ', '? ', '\n\n']))
1298
+
1299
+ # Capitalize first letter if it's at sentence start
1300
+ if is_sentence_start and content and content[0].isalpha():
1301
+ content = content[0].upper() + content[1:]
1302
+
1303
+ return f'<{tag}>{content}</{tag}>'
1304
+
1305
+ # Fix bold/strong tags
1306
+ text = re.sub(bold_pattern, fix_bold_match, text)
1307
+
1308
+ # Fix spacing around bold/strong tags (but not for list items)
1309
+ if not is_list_item_with_colon(text):
1310
+ text = re.sub(r'\.\s*<(strong|b)>', r'. <\1>', text) # Period before bold
1311
+ text = re.sub(r'</(strong|b)>\s*\.', r'</\1>.', text) # Period after bold
1312
+ text = re.sub(r'([.!?])\s*<(strong|b)>', r'\1 <\2>', text) # Space after sentence end
1313
+ text = re.sub(r'</(strong|b)>\s+([a-z])', lambda m: f'</{m.group(1)}> {m.group(2)}', text) # Keep lowercase after bold if mid-sentence
1314
+
1315
+ # Remove duplicate periods around bold tags
1316
+ text = re.sub(r'\.\s*</(strong|b)>\s*\.', r'</\1>.', text)
1317
+ text = re.sub(r'\.\s*<(strong|b)>\s*\.', r'. <\1>', text)
1318
+
1319
+ # Fix cases where bold content ends a sentence
1320
+ # If bold is followed by a new sentence (capital letter), add period
1321
+ text = re.sub(r'</(strong|b)>\s+([A-Z])', r'</\1>. \2', text)
1322
+
1323
+ # Don't remove these for list items
1324
+ if not is_list_item_with_colon(text):
1325
+ text = re.sub(r'<(strong|b)>\s*:\s*</\1>', ':', text) # Remove empty bold colons
1326
+ text = re.sub(r'<(strong|b)>\s*\.\s*</\1>', '.', text) # Remove empty bold periods
1327
+
1328
+ return text
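+         # Illustrative behavior (not from the original source):
+         #   '<strong>Key point:</strong> do this'  -> unchanged (list header preserved)
+         #   'So. <b>next steps.</b> here'          -> 'So. <b>Next steps</b> here'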
+
+     def extract_text_from_html(self, html_content):
+         """Extract text elements from HTML with skip logic"""
+         soup = BeautifulSoup(html_content, 'html.parser')
+         text_elements = []
+
+         # Get all text nodes using string= instead of the deprecated text= argument
+         for element in soup.find_all(string=True):
+             # Skip script, style, and noscript content completely
+             if element.parent.name in ['script', 'style', 'noscript']:
+                 continue
+
+             text = element.strip()
+             if text and not self.should_skip_element(element, text):
+                 text_elements.append({
+                     'text': text,
+                     'element': element
+                 })
+
+         return soup, text_elements
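+         # Illustrative (not from the original source): for
+         # BeautifulSoup('<p>Hi there</p>', 'html.parser'), find_all(string=True)
+         # yields the NavigableString 'Hi there', whose .parent.name is 'p';
+         # should_skip_element (defined elsewhere) then filters unwanted nodes.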
+
+     def validate_and_fix_html(self, html_text):
+         """Fix common HTML syntax errors after processing"""
+
+         # Fix DOCTYPE
+         html_text = re.sub(r'<!\s*DOCTYPE', '<!DOCTYPE', html_text, flags=re.IGNORECASE)
+
+         # Fix spacing issues (note: collapsing whitespace between tags can also
+         # remove meaningful spaces between adjacent inline elements)
+         html_text = re.sub(r'>\s+<', '><', html_text)  # Remove extra spaces between tags
+         html_text = re.sub(r'\s+>', '>', html_text)  # Remove spaces before closing >
+         html_text = re.sub(r'<\s+', '<', html_text)  # Remove spaces after opening <
+
+         # Fix common word errors that might occur during processing
+         html_text = html_text.replace('down loaded', 'downloaded')
+         html_text = html_text.replace('But your document', 'Your document')
+
+         return html_text
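+         # e.g. (illustrative) re.sub(r'>\s+<', '><', '<p> <b>hi</b> </p>')
+         # returns '<p><b>hi</b></p>'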
+
+     def add_natural_flow_variations(self, text):
+         """Add more natural flow and rhythm variations for Originality AI"""
+         sentences = self.split_into_sentences_advanced(text)
+         enhanced_sentences = []
+
+         for i, sentence in enumerate(sentences):
+             if not sentence.strip():
+                 continue
+
+             # Add stream-of-consciousness elements (8% chance - reduced)
+             if random.random() < 0.08 and len(sentence.split()) > 10:
+                 stream_elements = [
+                     " - wait, let me back up - ",
+                     " - actually, scratch that - ",
+                     " - or maybe I should say - ",
+                     " - hmm, how do I put this - ",
+                     " - okay, here's the thing - ",
+                     " - you know what I mean? - "
+                 ]
+                 words = sentence.split()
+                 pos = random.randint(len(words)//4, 3*len(words)//4)
+                 words.insert(pos, random.choice(stream_elements))
+                 sentence = ' '.join(words)
+
+             # Add human-like self-corrections (7% chance - reduced)
+             if random.random() < 0.07:
+                 corrections = [
+                     " - or rather, ",
+                     " - well, actually, ",
+                     " - I mean, ",
+                     " - or should I say, ",
+                     " - correction: "
+                 ]
+                 words = sentence.split()
+                 if len(words) > 8:
+                     # Insert the correction in the second half of the sentence
+                     pos = random.randint(len(words)//2, len(words)-3)
+                     words.insert(pos, random.choice(corrections))
+                     sentence = ' '.join(words)
+
+             # Add thinking-out-loud patterns (10% chance - reduced)
+             if random.random() < 0.10 and i > 0:
+                 thinking_patterns = [
+                     "Come to think of it, ",
+                     "Actually, you know what? ",
+                     "Wait, here's a thought: ",
+                     "Oh, and another thing - ",
+                     "Speaking of which, ",
+                     "This reminds me, ",
+                     "Now that I mention it, ",
+                     "Funny you should ask, because "
+                 ]
+                 pattern = random.choice(thinking_patterns)
+                 if len(sentence) > 1:
+                     sentence = pattern + sentence[0].lower() + sentence[1:]
+
+             enhanced_sentences.append(sentence)
+
+         return ' '.join(enhanced_sentences)
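+         # Illustrative only (output is randomized): a later sentence such as
+         # "This improves results." might come back as
+         # "Come to think of it, this improves results."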
+
+     def process_html(self, html_content, progress_callback=None):
+         """Main processing function with progress callback"""
+         if not html_content.strip():
+             return "Please provide HTML content."
+
+         # Store all script and style content to preserve it
+         script_placeholder = "###SCRIPT_PLACEHOLDER_{}###"
+         style_placeholder = "###STYLE_PLACEHOLDER_{}###"
+         preserved_scripts = []
+         preserved_styles = []
+
+         # Temporarily replace script and style tags with placeholders
+         soup_temp = BeautifulSoup(html_content, 'html.parser')
+
+         # Preserve all script tags
+         for idx, script in enumerate(soup_temp.find_all('script')):
+             placeholder = script_placeholder.format(idx)
+             preserved_scripts.append(str(script))
+             script.replace_with(placeholder)
+
+         # Preserve all style tags
+         for idx, style in enumerate(soup_temp.find_all('style')):
+             placeholder = style_placeholder.format(idx)
+             preserved_styles.append(str(style))
+             style.replace_with(placeholder)
+
+         # Get the modified HTML
+         html_content = str(soup_temp)
+
+         try:
+             # Extract text elements
+             soup, text_elements = self.extract_text_from_html(html_content)
+
+             total_elements = len(text_elements)
+             print(f"Found {total_elements} text elements to process (after filtering)")
+
+             # Process each text element
+             processed_count = 0
+
+             for i, element_info in enumerate(text_elements):
+                 original_text = element_info['text']
+
+                 # Skip placeholders
+                 if "###SCRIPT_PLACEHOLDER_" in original_text or "###STYLE_PLACEHOLDER_" in original_text:
+                     continue
+
+                 # Skip very short texts
+                 if len(original_text.split()) < 3:
+                     continue
+
+                 # First pass with Dipper
+                 paraphrased_text = self.paraphrase_with_dipper(
+                     original_text,
+                     lex_diversity=60,
+                     order_diversity=20
+                 )
+
+                 # Second pass with BART for longer texts (balanced probability)
+                 if self.use_bart and len(paraphrased_text.split()) > 8:
+                     # 30% chance to use BART for more variation (balanced)
+                     if random.random() < 0.3:
+                         paraphrased_text = self.paraphrase_with_bart(paraphrased_text)
+
+                 # Apply sentence variation
+                 paraphrased_text = self.apply_sentence_variation(paraphrased_text)
+
+                 # Add natural flow variations
+                 paraphrased_text = self.add_natural_flow_variations(paraphrased_text)
+
+                 # Fix punctuation and formatting
+                 paraphrased_text = self.fix_punctuation(paraphrased_text)
+
+                 # Final quality check
+                 if paraphrased_text and len(paraphrased_text.split()) >= 3:
+                     element_info['element'].replace_with(NavigableString(paraphrased_text))
+                     processed_count += 1
+
+                 # Progress update
+                 if progress_callback:
+                     progress_callback(i + 1, total_elements)
+
+                 if i % 10 == 0 or i == total_elements - 1:
+                     progress = (i + 1) / total_elements * 100
+                     print(f"Progress: {progress:.1f}%")
+
+             # Get the processed HTML
+             result = str(soup)
+
+             # Restore all script tags
+             for idx, script_content in enumerate(preserved_scripts):
+                 placeholder = script_placeholder.format(idx)
+                 result = result.replace(placeholder, script_content)
+
+             # Restore all style tags
+             for idx, style_content in enumerate(preserved_styles):
+                 placeholder = style_placeholder.format(idx)
+                 result = result.replace(placeholder, style_content)
+
+             # Post-process the entire HTML to fix bold/strong formatting
+             result = self.post_process_html(result)
+
+             # Validate and fix HTML syntax
+             result = self.validate_and_fix_html(result)
+
+             # Count skipped elements (approximate: the soup has already been
+             # mutated, so this compares remaining text nodes against the total)
+             all_text_elements = soup.find_all(string=True)
+             skipped = len([e for e in all_text_elements if e.strip() and e.parent.name not in ['script', 'style', 'noscript']]) - total_elements
+
+             print(f"Successfully processed {processed_count} text elements")
+             print(f"Skipped {skipped} elements (headings, CTAs, tables, testimonials, strong/bold tags, etc.)")
+             print(f"Preserved {len(preserved_scripts)} script tags and {len(preserved_styles)} style tags")
+
+             return result
+
+         except Exception as e:
+             import traceback
+             error_msg = f"Error processing HTML: {str(e)}\n{traceback.format_exc()}"
+             print(error_msg)
+             # Return the original HTML with the error message prepended as an HTML comment
+             return f"<!-- {error_msg} -->\n{html_content}"
+
+     def post_process_html(self, html_text):
+         """Post-process the entire HTML to fix formatting issues"""
+         # Fix empty angle brackets that might appear
+         html_text = re.sub(r'<>\s*([^<>]+?)\s*(?=\.|\s|<)', r'\1', html_text)  # Remove <> around text
+         html_text = re.sub(r'<>', '', html_text)  # Remove any remaining empty <>
+
+         # Fix double angle brackets around bold tags
+         html_text = re.sub(r'<<b>>', '<b>', html_text)
+         html_text = re.sub(r'<</b>>', '</b>', html_text)
+         html_text = re.sub(r'<<strong>>', '<strong>', html_text)
+         html_text = re.sub(r'<</strong>>', '</strong>', html_text)
+
+         # Fix periods around bold/strong tags (replacement strings must be raw
+         # so \1 is a backreference rather than the control character '\x01')
+         html_text = re.sub(r'\.\s*<(b|strong)>', r'. <\1>', html_text)  # Period before bold
+         html_text = re.sub(r'</(b|strong)>\s*\.', r'</\1>.', html_text)  # Period after bold
+         html_text = re.sub(r'\.<<(b|strong)>>', r'. <\1>', html_text)  # Fix double bracket cases
+         html_text = re.sub(r'</(b|strong)>>\.', r'</\1>.', html_text)
+
+         # Fix periods after colons
+         html_text = re.sub(r':\s*\.', ':', html_text)
+         html_text = re.sub(r'\.:', ':', html_text)
+
+         # Check if a line is a list item
+         def process_line(line):
+             # Check if this line contains a list pattern with bold
+             list_pattern = r'(?:^|\s)(?:[-•*▪▫◦‣⁃]\s*)?<(?:strong|b)>[^<]+:</(?:strong|b)>'
+             if re.search(list_pattern, line):
+                 # This is a list item, preserve the colon format
+                 return line
+
+             # Not a list item, apply regular fixes
+             # Remove periods immediately inside bold tags
+             line = re.sub(r'<(strong|b)>\s*\.\s*([^<]+)\s*\.\s*</\1>', r'<\1>\2</\1>', line)
+
+             # Fix sentence endings with bold
+             line = re.sub(r'</(strong|b)>\s*([.!?])', r'</\1>\2', line)
+
+             return line
+
+         # Process line by line to preserve list formatting
+         lines = html_text.split('\n')
+         processed_lines = [process_line(line) for line in lines]
+         html_text = '\n'.join(processed_lines)
+
+         # Fix sentence starts with bold
+         def fix_bold_sentence_start(match):
+             pre_context = match.group(1)
+             tag = match.group(2)  # bare tag name: 'strong' or 'b'
+             content = match.group(3)
+
+             # Skip if this is part of a list item with a colon
+             full_match = match.group(0)
+             if ':' in full_match and f'</{tag}>' in full_match:
+                 return full_match
+
+             # Check if this should start with a capital
+             if pre_context == '' or pre_context.endswith(('.', '!', '?', '>')):
+                 if content and content[0].islower():
+                     content = content[0].upper() + content[1:]
+
+             return f'{pre_context}<{tag}>{content}'
+
+         # Look for bold/strong tags and check their context; capture only the
+         # tag name so the rebuilt tag isn't double-bracketed
+         html_text = re.sub(r'(^|.*?)<(strong|b)>([a-zA-Z])', fix_bold_sentence_start, html_text)
+
+         # Clean up spacing around bold tags (but preserve list formatting)
+         # Split into segments to handle list items separately
+         segments = re.split(r'(<(?:strong|b)>[^<]*:</(?:strong|b)>)', html_text)
+         cleaned_segments = []
+
+         for i, segment in enumerate(segments):
+             if i % 2 == 1:  # This is a list item pattern
+                 cleaned_segments.append(segment)
+             else:
+                 # Apply spacing fixes to non-list segments
+                 segment = re.sub(r'\s+<(strong|b)>', r' <\1>', segment)
+                 segment = re.sub(r'</(strong|b)>\s+', r'</\1> ', segment)
+                 # Fix punctuation issues
+                 segment = re.sub(r'([.,!?;:])\s*([.,!?;:])', r'\1', segment)
+                 # Fix periods inside/around bold (raw replacement strings again)
+                 segment = re.sub(r'\.<(strong|b)>\.', r'. <\1>', segment)
+                 segment = re.sub(r'\.</(strong|b)>\.', r'</\1>.', segment)
+                 cleaned_segments.append(segment)
+
+         html_text = ''.join(cleaned_segments)
+
+         # Final cleanup
+         html_text = re.sub(r'\.{2,}', '.', html_text)  # Multiple periods
+         html_text = re.sub(r',{2,}', ',', html_text)  # Multiple commas
+         html_text = re.sub(r':{2,}', ':', html_text)  # Multiple colons
+         html_text = re.sub(r'\s+([.,!?;:])', r'\1', html_text)  # Space before punctuation
+
+         # Fix empty bold tags (but not those with just colons)
+         html_text = re.sub(r'<(strong|b)>\s*</\1>', '', html_text)
+
+         # Fix specific patterns in lists/stats:
+         # a bare stat like "5,000+" should not get a trailing period
+         html_text = re.sub(r'(\d+[,\d]*\+?)\s*\.\s*\n', r'\1\n', html_text)
+
+         # Clean up any remaining double brackets
+         html_text = re.sub(r'<<', '<', html_text)
+         html_text = re.sub(r'>>', '>', html_text)
+
+         # Apply final minimal grammar fixes
+         html_text = self.grammar_fixer.smart_fix(html_text)
+
+         return html_text
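+         # Why raw replacement strings matter here (illustrative):
+         #   re.sub(r'(a)', '<\1>', 'a')   -> '<\x01>'  ('\1' is an octal escape)
+         #   re.sub(r'(a)', r'<\1>', 'a')  -> '<a>'     (r'\1' is a backreference)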
 
+ # Initialize the humanizer
+ humanizer = EnhancedDipperHumanizer()
 
+ def humanize_html(html_input, progress=gr.Progress()):
+     """Gradio interface function with progress updates"""
+     if not html_input:
+         return "Please provide HTML content to humanize."
+
+     progress(0, desc="Starting processing...")
+     start_time = time.time()
+
+     # Create a wrapper to update progress
+     def progress_callback(current, total):
+         if total > 0:
+             progress(current / total, desc=f"Processing: {current}/{total} elements")
+
+     # Pass the progress callback to process_html
+     result = humanizer.process_html(
+         html_input,
+         progress_callback=progress_callback
+     )
+
+     processing_time = time.time() - start_time
+     print(f"Processing completed in {processing_time:.2f} seconds")
+     progress(1.0, desc="Complete!")
+
+     return result
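+     # Note: the gr.Progress() default argument lets Gradio inject a progress
+     # tracker when the function runs inside the UI; called directly, the
+     # default object may not update anything visible.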
 
+ # Create the Gradio interface (the queue is enabled at launch time)
+ iface = gr.Interface(
+     fn=humanize_html,
+     inputs=[
+         gr.Textbox(
+             lines=10,
+             placeholder="Paste your HTML content here...",
+             label="HTML Input"
+         )
+     ],
+     outputs=gr.Textbox(
+         lines=10,
+         label="Humanized HTML Output"
+     ),
+     title="Enhanced Dipper AI Humanizer - Optimized for Originality AI",
+     description="""
+     Aggressive humanizer aimed at raising human scores on both Undetectable AI and Originality AI.
+
+     Key features:
+     - High diversity settings (60% lexical, 20% order passed to Dipper) for natural variation
+     - Enhanced human patterns: personal opinions, self-corrections, thinking-out-loud
+     - Natural typos, contractions, and conversational flow
+     - Stream-of-consciousness elements and rhetorical questions
+     - Originality AI-specific optimizations: varied sentence starters, emphatic repetitions
+     - Skips content in <strong>, <b>, and heading tags (including inside tables)
+     - Tuned to target strict AI-detection systems
+
+     The aim is writing with genuinely human-like rhythm and variation.
+
+     ⚠️ Note: Processing may take 5-10 minutes for large HTML documents.
+     """,
+     examples=[
+         ["""<article>
+ <h1>The Benefits of Regular Exercise</h1>
+ <div class="author-intro">By John Doe, Fitness Expert | 10 years experience</div>
+ <p>Regular exercise is essential for maintaining good health. It helps improve cardiovascular fitness, strengthens muscles, and enhances mental well-being. Studies have shown that people who exercise regularly have lower risks of chronic diseases.</p>
+ <p>Additionally, exercise can boost mood and energy levels. It releases endorphins, which are natural mood elevators. Even moderate activities like walking can make a significant difference in overall health.</p>
+ </article>"""]
      ],
+     theme="default"
  )
 
 
  if __name__ == "__main__":
+     # Enable the queue for better handling of long-running processes
+     iface.queue(max_size=10)
+     iface.launch(share=True)
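+     # Note: share=True exposes a temporary public URL, and queue(max_size=10)
+     # bounds how many requests may wait at once - sensible defaults for a
+     # Space, not hard requirements.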