Print Page - Seinfeld Markov Chain Chatbot

Title: Seinfeld Markov Chain Chatbot
Post by: ron77 on Apr 03, 2025, 04:14 PM
#!/usr/bin/env python3
"""
Seinfeld Markov Chain Chatbot
A Python implementation of a Markov chain chatbot trained on Seinfeld dialogue.
"""

import random
import re
import pickle
import os
from collections import defaultdict, Counter
import string

class MarkovModel:
    """First-order Markov chain over string tokens.

    Every distinct token is stored once in ``tokens`` and addressed by its
    integer index.  ``transitions[i]`` lists the index of every token that
    was recorded directly after token ``i``; repeats are kept, so
    ``random.choice`` picks a successor in proportion to how often it was
    recorded.  ``-1`` is the "no such token" sentinel returned by lookups.
    """

    def __init__(self):
        self.tokens = []  # index -> token text, in order of first appearance
        self.token_to_idx = {}  # token text -> index (exact, case-sensitive)
        self.transitions = []  # index -> list of successor indices (repeats kept)
        self.model_trained = False  # set by the trainer once data was loaded

    def _has_index(self, token_idx):
        """Return True when *token_idx* addresses a stored token.

        Negative values are rejected on purpose: ``-1`` is the "not found"
        sentinel and must never be interpreted as "the last token".
        """
        return 0 <= token_idx < len(self.transitions)

    def add_token(self, token):
        """Register *token* if it is new and return its index."""
        existing_idx = self.token_to_idx.get(token)
        if existing_idx is not None:
            return existing_idx

        token_idx = len(self.tokens)
        self.tokens.append(token)
        self.token_to_idx[token] = token_idx
        self.transitions.append([])
        return token_idx

    def add_transition(self, from_token_idx, to_token_idx):
        """Record that token *to_token_idx* followed token *from_token_idx*.

        Raises:
            IndexError: if either index does not address a stored token
                (this includes the ``-1`` sentinel).
        """
        if not (self._has_index(from_token_idx) and self._has_index(to_token_idx)):
            raise IndexError(
                f"invalid transition {from_token_idx} -> {to_token_idx}"
            )
        self.transitions[from_token_idx].append(to_token_idx)

    def get_next_token(self, token_idx):
        """Return a random successor index of *token_idx*.

        Returns ``-1`` when *token_idx* is not a valid index or the token
        has no recorded successors.
        """
        if not self._has_index(token_idx):
            return -1

        successors = self.transitions[token_idx]
        return random.choice(successors) if successors else -1

    def _pick_popular_token(self):
        """Return the index of a random well-connected token, or ``-1``.

        Only tokens with more than five recorded transitions and without
        script artifacts ('(', ')' or ':') qualify; the pick is made among
        the ten candidates with the most transitions.
        """
        candidates = [
            (idx, len(successors))
            for idx, successors in enumerate(self.transitions)
            if len(successors) > 5
            and not any(mark in self.tokens[idx] for mark in '():')
        ]
        if not candidates:
            return -1

        candidates.sort(key=lambda item: item[1], reverse=True)
        return random.choice(candidates[:10])[0]

    def find_token(self, search_token):
        """Map *search_token* to a token index using increasingly loose rules.

        The rules are tried in order and the first hit wins:

        1. exact match;
        2. case-insensitive match;
        3. *search_token* is a substring of a token (case-insensitive);
        4. for input containing a space, a token that starts with the
           first word of the input;
        5. a random well-connected token;
        6. any random token.

        Returns ``-1`` only when the model holds no tokens at all.
        """
        # Rule 1: exact lookup.
        exact_idx = self.token_to_idx.get(search_token)
        if exact_idx is not None:
            return exact_idx

        needle = search_token.lower()
        # Lower-case every token once; rules 2-4 all compare against it.
        lowered_tokens = [token.lower() for token in self.tokens]

        # Rule 2: case-insensitive equality.
        for idx, lowered in enumerate(lowered_tokens):
            if lowered == needle:
                return idx

        # Rule 3: substring match.
        for idx, lowered in enumerate(lowered_tokens):
            if needle in lowered:
                return idx

        # Rule 4: first word of a multi-word query.  A query made only of
        # whitespace has no first word, so it is skipped instead of failing.
        query_words = needle.split()
        if ' ' in search_token and query_words:
            first_word = query_words[0]
            for idx, lowered in enumerate(lowered_tokens):
                if lowered.startswith(first_word):
                    return idx

        # Rule 5: fall back to a popular token so generation can still start.
        popular_idx = self._pick_popular_token()
        if popular_idx >= 0:
            return popular_idx

        # Rule 6: any token at all.
        return random.randint(0, len(self.tokens) - 1) if self.tokens else -1


class SeinfeldChatbot:
    """Chatbot that answers with Seinfeld-style lines sampled from a Markov chain.

    The bot owns a Markov model (``self.model``), trains or loads it, picks
    seed words out of the user's message and walks the chain from those
    seeds to build a reply.  A canned quote is returned when no usable
    sentence could be produced.
    """

    # Patterns used by clean_text(), compiled once for the whole class.
    _STAGE_DIRECTION_RE = re.compile(r'\([^)]*\)')
    _SPEAKER_PREFIX_RE = re.compile(r'^\s*[A-Z]+:', re.MULTILINE)
    _WHITESPACE_RE = re.compile(r'\s+')
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
    _SENTENCE_ENDINGS = ('.', '!', '?')

    _TARGET_SENTENCE_WORDS = 8  # a generated sentence stops at this many words
    _MAX_REPEAT_RETRIES = 30  # draws wasted on already-used tokens before giving up
    _MAX_SEEDS_TRIED = 5  # only the best-scored seeds are tried per reply
    _MAX_USED_SEEDS = 100  # trim the used-seed memory once it exceeds this
    _KEPT_USED_SEEDS = 50  # number of most recent seeds kept when trimming

    def __init__(self):
        self.model = MarkovModel()
        self.seinfeld_characters = ["JERRY", "GEORGE", "ELAINE", "KRAMER", "NEWMAN"]
        self.min_response_length = 5  # Minimum words in a response
        self.max_response_attempts = 20  # Total generation attempts per reply
        self.stopwords = {
            "a", "an", "the", "and", "but", "or", "for", "nor", "on", "at", "to", "from",
            "by", "with", "in", "out", "over", "under", "again", "further", "then",
            "once", "here", "there", "when", "where", "why", "how", "all", "any",
            "both", "each", "few", "more", "most", "other", "some", "such", "no",
            "not", "only", "own", "same", "so", "than", "too", "very", "s", "t",
            "can", "will", "just", "don", "should", "now", "d", "ll", "m", "o",
            "re", "ve", "y", "ain", "aren", "couldn", "didn", "doesn", "hadn",
            "hasn", "haven", "isn", "ma", "mightn", "mustn", "needn", "shan",
            "shouldn", "wasn", "weren", "won", "wouldn", "i", "me", "my", "myself",
            "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself",
            "yourselves", "he", "him", "his", "himself", "she", "her", "hers",
            "herself", "it", "its", "itself", "they", "them", "their", "theirs"
        }
        self.debug_mode = False
        self.used_seeds = set()  # Seeds already used, to avoid repetition
        self._seed_history = []  # Same seeds in order of use (a set has no order)

    def clean_text(self, text):
        """Strip script artifacts from *text* and normalise its whitespace.

        Removes parenthesised stage directions and an upper-case speaker
        prefix (e.g. ``JERRY:``) at the start of every line, then collapses
        all runs of whitespace into single spaces.
        """
        text = self._STAGE_DIRECTION_RE.sub('', text)
        text = self._SPEAKER_PREFIX_RE.sub('', text)
        return self._WHITESPACE_RE.sub(' ', text).strip()

    def tokenize(self, text):
        """Split *text* into whitespace-separated tokens."""
        return text.split()

    def remove_punctuation(self, word):
        """Return *word* with every ASCII punctuation character removed."""
        return word.translate(self._PUNCTUATION_TABLE)

    def train_on_seinfeld_data(self, filename):
        """Train the model on the dialogue found in *filename*.

        Only lines that start with one of ``self.seinfeld_characters``
        followed by a colon are used; everything else (blank lines, scene
        markers, other speakers) is ignored.  Tokens and transitions are
        added to the current model, so training an already-trained model
        extends it.

        Returns:
            True when at least one token was learned, False when the file
            could not be read or contained no usable dialogue.
        """
        print(f"Training on Seinfeld data from {filename}...")

        speaker_prefixes = tuple(name + ':' for name in self.seinfeld_characters)
        dialog_lines = []

        try:
            with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
                for raw_line in f:
                    line = raw_line.strip()
                    if not line.startswith(speaker_prefixes):
                        continue
                    # Everything after the first colon is the spoken text.
                    dialogue = self.clean_text(line.partition(':')[2])
                    if dialogue:
                        dialog_lines.append(dialogue)
        except OSError as e:
            print(f"Error training model: {e}")
            return False

        token_count = 0
        # Not reset between lines: the last token of one line is linked to
        # the first token of the next one.
        prev_token_idx = -1

        for line in dialog_lines:
            for token in self.tokenize(line):
                token_idx = self.model.add_token(token)
                token_count += 1
                if prev_token_idx >= 0:
                    self.model.add_transition(prev_token_idx, token_idx)
                prev_token_idx = token_idx

        if token_count == 0:
            # Nothing was learned, so do not claim the model is usable.
            print(f"Error training model: no dialogue lines found in {filename}")
            return False

        self.model.model_trained = True
        print(f"Training complete! Processed {token_count} tokens from {len(dialog_lines)} dialogue lines.")
        return True

    def save_model(self, filename):
        """Pickle the trained model to *filename*; return True on success."""
        if not self.model.model_trained:
            print("No trained model to save!")
            return False

        try:
            with open(filename, 'wb') as f:
                pickle.dump(self.model, f)
        except Exception as e:
            # Broad on purpose: besides I/O errors, pickling itself can fail
            # with several unrelated exception types.
            print(f"Error saving model: {e}")
            return False

        print(f"Model saved to {filename}")
        return True

    def load_model(self, filename):
        """Replace the current model with one unpickled from *filename*.

        The current model is kept when the file cannot be read or does not
        contain an object that looks like a Markov model.

        Returns:
            True when a model was loaded, False otherwise.
        """
        # SECURITY: pickle.load can execute arbitrary code contained in the
        # file.  Only load model files that come from a trusted source.
        try:
            with open(filename, 'rb') as f:
                loaded = pickle.load(f)
        except Exception as e:
            # Broad on purpose: unpickling corrupt data can raise many
            # different exception types.
            print(f"Error loading model: {e}")
            return False

        required = ("tokens", "token_to_idx", "transitions", "model_trained")
        if not all(hasattr(loaded, name) for name in required):
            print(f"Error loading model: {filename} does not contain a Markov model")
            return False

        self.model = loaded
        print(f"Model loaded from {filename}")
        return True

    def find_good_seeds(self, user_input):
        """Return candidate seed phrases from *user_input*, best first.

        Candidates and their scores:

        * pairs of adjacent "important" words (score 3);
        * single important words (score 2);
        * every 1- to 3-word phrase of the raw input (score = word count).

        A word is important when, after removing surrounding punctuation,
        it is longer than two characters, is not a number and is not a
        stopword.  Seeds in ``self.used_seeds`` and seeds the model cannot
        map to a token are dropped.  Each seed appears once, with its best
        score; ties keep the order in which the seeds were first produced.
        """
        words = user_input.split()

        # Strip surrounding punctuation first so that "the," is still
        # recognised as a stopword and "pretzels!" can match "pretzels".
        bare_words = (word.strip(string.punctuation) for word in words)
        important_words = [
            word for word in bare_words
            if len(word) > 2
            and not word.isdigit()
            and word.lower() not in self.stopwords
        ]

        candidates = [
            (f"{first} {second}", 3)
            for first, second in zip(important_words, important_words[1:])
        ]
        candidates.extend((word, 2) for word in important_words)
        for start in range(len(words)):
            for length in range(1, min(4, len(words) - start + 1)):
                phrase = " ".join(words[start:start + length])
                candidates.append((phrase, length))

        best_scores = {}  # seed -> best score; dicts keep insertion order
        for seed, score in candidates:
            if seed in self.used_seeds:
                continue
            if seed in best_scores:
                best_scores[seed] = max(best_scores[seed], score)
            elif self.model.find_token(seed) >= 0:
                best_scores[seed] = score

        # sorted() is stable, so equal scores keep their insertion order.
        return sorted(best_scores, key=best_scores.get, reverse=True)

    def generate_sentence(self, seed_phrase):
        """Generate one sentence that starts at the token matching *seed_phrase*.

        The chain is walked until a token ends with '.', '!' or '?', the
        sentence reaches eight words, the current token has no successors,
        or too many draws landed on tokens already in the sentence.  No
        token is used twice within a sentence.

        Returns:
            The capitalised sentence, or "" when the model is untrained,
            the seed is empty or unknown, or the result has fewer than
            ``self.min_response_length`` words.
        """
        if not seed_phrase or not self.model.model_trained:
            return ""

        token_idx = self.model.find_token(seed_phrase)
        if token_idx < 0:
            return ""

        tokens = self.model.tokens
        first_token = tokens[token_idx]
        words = [first_token]  # length is counted on what was really emitted
        used_tokens = {first_token}
        wasted_draws = 0

        while (len(words) < self._TARGET_SENTENCE_WORDS
               and wasted_draws < self._MAX_REPEAT_RETRIES):
            next_token_idx = self.model.get_next_token(token_idx)
            if next_token_idx < 0:
                break  # dead end: drawing again cannot produce a successor

            if next_token_idx >= len(tokens) or tokens[next_token_idx] in used_tokens:
                wasted_draws += 1
                continue

            next_token = tokens[next_token_idx]
            words.append(next_token)
            used_tokens.add(next_token)
            token_idx = next_token_idx

            if next_token.endswith(self._SENTENCE_ENDINGS):
                break

        sentence = " ".join(words)
        if sentence and not sentence.endswith(self._SENTENCE_ENDINGS):
            sentence += "."
        sentence = self._WHITESPACE_RE.sub(' ', sentence.strip())

        if len(sentence.split()) < self.min_response_length:
            return ""

        return sentence[0].upper() + sentence[1:]

    def generate_fallback_response(self):
        """Return a random canned Seinfeld quote."""
        responses = (
            "What's the deal with that?",
            "Not that there's anything wrong with that.",
            "These pretzels are making me thirsty!",
            "I'm out there, Jerry, and I'm loving every minute of it!",
            "No soup for you!",
            "Serenity now!",
            "I don't wanna be a pirate!",
            "It's not a lie if you believe it.",
            "You know, we're living in a society!",
            "I'm speechless. I have no speech.",
            "You want a piece of me? YOU GOT IT!",
            "Hello, Newman.",
            "That's a shame.",
            "I've yada yada'd sex.",
            "Maybe the dingo ate your baby!",
            "I choose not to run!",
            "It's not you, it's me.",
            "You can stuff your sorries in a sack, mister!",
            "I'm a joke maker. Tell him, Jerry.",
            "And you want to be my latex salesman...",
        )
        return random.choice(responses)

    def _mark_seed_used(self, seed):
        """Remember *seed* as used, keeping only the most recent seeds.

        ``used_seeds`` is a set and therefore has no order; the parallel
        ``_seed_history`` list records the order of use so that trimming
        drops the oldest seeds rather than arbitrary ones.
        """
        if seed not in self.used_seeds:
            self.used_seeds.add(seed)
            self._seed_history.append(seed)

        if len(self.used_seeds) > self._MAX_USED_SEEDS:
            self._seed_history = self._seed_history[-self._KEPT_USED_SEEDS:]
            self.used_seeds = set(self._seed_history)

    def generate_response(self, user_input):
        """Generate a reply to *user_input*.

        Up to five of the best seeds are tried, and
        ``self.max_response_attempts`` is shared between the seeds that are
        actually tried (at least one attempt each).  One of the three
        longest sentences produced is returned; a canned quote is returned
        for blank input or when no sentence was produced.

        Returns:
            The reply text, or a notice when the model is not trained.
        """
        if not self.model.model_trained:
            return "The chatbot hasn't been trained yet!"

        if not user_input or not user_input.strip():
            return self.generate_fallback_response()

        cleaned_input = self.clean_text(user_input)
        seeds = self.find_good_seeds(cleaned_input)[:self._MAX_SEEDS_TRIED]

        responses = []
        if seeds:
            # Divide by the seeds that are tried, not by all seeds found;
            # otherwise a long input leaves zero attempts per seed.
            attempts_per_seed = max(1, self.max_response_attempts // len(seeds))
            for seed in seeds:
                for _ in range(attempts_per_seed):
                    response = self.generate_sentence(seed)
                    if response and len(response.split()) >= self.min_response_length:
                        responses.append(response)

                # Avoid starting from the same seed again in later replies.
                self._mark_seed_used(seed)

        if not responses:
            return self.generate_fallback_response()

        # Prefer longer sentences, with some variety among the top three.
        responses.sort(key=lambda text: len(text.split()), reverse=True)
        return random.choice(responses[:3])

    def get_character_name(self):
        """Return "Jerry" or "George" at random, used to label a reply."""
        top_characters = ["Jerry", "George"]
        return random.choice(top_characters)


def _prompt(message, default=""):
    """Read one line from the user, stripped of surrounding whitespace.

    Returns *default* when the stripped answer is empty.
    """
    answer = input(message).strip()
    return answer or default


def _prompt_existing_file(message, default):
    """Ask for a file path and return it, or None if the file does not exist.

    A "not found" message is printed before None is returned.
    """
    filename = _prompt(message, default)
    if not os.path.exists(filename):
        print(f"File {filename} not found!")
        return None
    return filename


def _chat_session(chatbot):
    """Run the chat loop until the user types an exit word or ends the input."""
    print("\n===== Seinfeld Chat Mode =====")
    print("(Type 'exit' to return to the main menu)")

    while True:
        try:
            user_input = input("\nYou: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Ctrl-D / Ctrl-C leaves the chat instead of crashing the program.
            print()
            return

        if user_input.lower() in ('exit', 'quit', 'bye'):
            return

        character = chatbot.get_character_name()
        response = chatbot.generate_response(user_input)
        print(f"\n{character}: {response}")


def _run_menu(chatbot):
    """Show the main menu repeatedly and dispatch the chosen action."""
    while True:
        print("\nOptions:")
        print("1. Train on Seinfeld dialogue")
        print("2. Load existing model")
        print("3. Chat with the bot")
        print("4. Save the current model")
        print("5. Exit")

        choice = _prompt("\nEnter your choice (1-5): ")

        if choice == '1':
            filename = _prompt_existing_file(
                "Enter the Seinfeld dialogue file path (default: seinfeld_complete_dialogue.txt): ",
                "seinfeld_complete_dialogue.txt",
            )
            if filename is not None:
                chatbot.train_on_seinfeld_data(filename)

        elif choice == '2':
            filename = _prompt_existing_file(
                "Enter the model file path (default: seinfeld_model.pkl): ",
                "seinfeld_model.pkl",
            )
            if filename is not None:
                chatbot.load_model(filename)

        elif choice == '3':
            if not chatbot.model.model_trained:
                print("Please train or load a model first!")
                continue
            _chat_session(chatbot)

        elif choice == '4':
            if not chatbot.model.model_trained:
                print("No trained model to save!")
                continue
            filename = _prompt(
                "Enter filename to save the model (default: seinfeld_model.pkl): ",
                "seinfeld_model.pkl",
            )
            chatbot.save_model(filename)

        elif choice == '5':
            print("Goodbye!")
            return

        else:
            print("Invalid choice, please try again.")


def main():
    """Run the interactive chatbot menu until the user exits.

    End of input (Ctrl-D) or an interrupt (Ctrl-C) at the menu ends the
    program with a goodbye message instead of a traceback.
    """
    chatbot = SeinfeldChatbot()

    print("=" * 60)
    print("  Seinfeld Markov Chain Chatbot")
    print("=" * 60)

    try:
        _run_menu(chatbot)
    except (EOFError, KeyboardInterrupt):
        print("\nGoodbye!")

# Start the interactive menu only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
seinfeld_complete_dialogue.txt
RetroCoders Community

Other Languages Programming => Python => Topic started by: ron77 on Apr 03, 2025, 04:14 PM