Module loda.llm

Large Language Model (LLM) implementation for natural language to LODA code generation.

This module provides functionality to train transformer-based models that can understand natural language descriptions of integer sequences (like OEIS sequences) and generate corresponding LODA assembly programs.

Key components:

- Data preprocessing for OEIS sequence descriptions and LODA programs
- Transformer-based encoder-decoder architecture
- Training pipeline with proper tokenization
- Inference utilities for code generation
- Evaluation metrics for generated programs

Example usage:

>>> from loda.llm import LodaT5Model, LodaGenerator, train_loda_llm
>>> 
>>> # Train a model
>>> model = train_loda_llm("programs/oeis", "trained_model")
>>> 
>>> # Generate code
>>> generator = LodaGenerator(model)
>>> results = generator.generate("Fibonacci numbers")
>>> print(results[0].generated_code)

Sub-modules

loda.llm.data_preprocessing

Data preprocessing utilities for LLM training on OEIS sequences and LODA programs …

loda.llm.inference

Inference and evaluation utilities for the LODA LLM …

loda.llm.model

Transformer-based model for natural language to LODA code generation …

loda.llm.trainer

Training script for the LODA LLM (Large Language Model) …

Functions

def create_dataset(programs_dir: str, output_file: str, max_examples: int = -1, augment: bool = True)
def create_dataset(programs_dir: str, output_file: str, max_examples: int = -1, augment: bool = True):
    """
    Convenience function to create and save a training dataset.
    
    Args:
        programs_dir: Path to OEIS programs directory
        output_file: Path to save the dataset
        max_examples: Maximum number of examples (-1 for all)
        augment: Whether to augment with description variations
    """
    preprocessor = DataPreprocessor(programs_dir)
    examples = preprocessor.create_training_examples(max_examples)
    
    if augment:
        examples = preprocessor.augment_descriptions(examples)
    
    preprocessor.save_dataset(examples, output_file)
    return examples

Convenience function to create and save a training dataset.

Args

programs_dir
Path to OEIS programs directory
output_file
Path to save the dataset
max_examples
Maximum number of examples (-1 for all)
augment
Whether to augment with description variations
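
A minimal usage sketch; the programs directory, output file name and example cap below are illustrative:

>>> from loda.llm import create_dataset
>>>
>>> # Build an augmented dataset from the first 1000 programs and save it as JSON.
>>> examples = create_dataset("programs/oeis", "loda_dataset.json", max_examples=1000)
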
def train_loda_llm(programs_dir: str,
output_dir: str = 'loda_llm_model',
model_name: str = 't5-small',
max_examples: int = -1,
val_split: float = 0.1,
batch_size: int = 8,
learning_rate: float = 5e-05,
num_epochs: int = 3)
def train_loda_llm(programs_dir: str,
                   output_dir: str = "loda_llm_model",
                   model_name: str = "t5-small",
                   max_examples: int = -1,
                   val_split: float = 0.1,
                   batch_size: int = 8,
                   learning_rate: float = 5e-5,
                   num_epochs: int = 3):
    """
    Main training function.
    
    Args:
        programs_dir: Directory containing OEIS programs
        output_dir: Directory to save the trained model
        model_name: Base T5 model to use
        max_examples: Maximum number of training examples (-1 for all)
        val_split: Fraction of data to use for validation
        batch_size: Training batch size
        learning_rate: Learning rate
        num_epochs: Number of training epochs
    """
    print("Preparing training data...")
    
    # Create training examples
    preprocessor = DataPreprocessor(programs_dir)
    examples = preprocessor.create_training_examples(max_examples)
    
    if len(examples) == 0:
        print("No training examples found!")
        return None
    
    # Augment examples
    print("Augmenting training examples...")
    examples = preprocessor.augment_descriptions(examples)
    
    # Split into train/validation
    if val_split > 0:
        split_idx = int(len(examples) * (1 - val_split))
        train_examples = examples[:split_idx]
        val_examples = examples[split_idx:]
    else:
        train_examples = examples
        val_examples = None
    
    print(f"Training examples: {len(train_examples)}")
    if val_examples:
        print(f"Validation examples: {len(val_examples)}")
    
    # Create model
    print(f"Creating model based on {model_name}...")
    model = LodaT5Model(model_name)
    
    # Create datasets
    train_dataset = LodaDataset(train_examples, model)
    val_dataset = LodaDataset(val_examples, model) if val_examples else None
    
    # Create trainer
    trainer = LodaTrainer(
        model=model,
        train_dataset=train_dataset,
        val_dataset=val_dataset,
        learning_rate=learning_rate,
        batch_size=batch_size,
        num_epochs=num_epochs,
        save_dir=output_dir
    )
    
    # Train the model
    trained_model = trainer.train()
    
    # Save final model
    trained_model.save_model(output_dir)
    print(f"Final model saved to {output_dir}")
    
    return trained_model

Main training function.

Args

programs_dir
Directory containing OEIS programs
output_dir
Directory to save the trained model
model_name
Base T5 model to use
max_examples
Maximum number of training examples (-1 for all)
val_split
Fraction of data to use for validation
batch_size
Training batch size
learning_rate
Learning rate
num_epochs
Number of training epochs
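
For a quick experiment, the defaults can be overridden explicitly; the subset size and hyperparameters below are illustrative, not tuned recommendations:

>>> from loda.llm import train_loda_llm
>>>
>>> # Train on a small subset for a single epoch; returns the trained LodaT5Model.
>>> model = train_loda_llm("programs/oeis",
...                        output_dir="loda_llm_model",
...                        max_examples=500,
...                        batch_size=4,
...                        num_epochs=1)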

Classes

class DataPreprocessor (programs_dir: str)
class DataPreprocessor:
    """Handles preprocessing of OEIS programs for LLM training."""
    
    def __init__(self, programs_dir: str):
        """Initialize with path to OEIS programs directory."""
        self.programs_dir = programs_dir
        self.program_cache = ProgramCache(programs_dir)
        
    def extract_description_from_program(self, program_text: str) -> Optional[str]:
        """
        Extract the natural language description from a LODA program.
        
        LODA programs typically start with comments like:
        ; A000045: Fibonacci numbers: F(n) = F(n-1) + F(n-2) with F(0) = 0 and F(1) = 1.
        
        Args:
            program_text: The full LODA program as text
            
        Returns:
            The description string or None if no description found
        """
        lines = program_text.strip().split('\n')
        
        for line in lines:
            # Look for OEIS description lines (start with ; A######:)
            match = re.match(r';\s*A\d{6}:\s*(.+)', line)
            if match:
                description = match.group(1).strip()
                # Clean up common artifacts
                description = description.rstrip('.')
                # Remove mathematical notation that might be confusing
                # Keep it simple for initial training
                return description
                
        return None
    
    def extract_terms_from_program(self, program_text: str) -> Optional[List[int]]:
        """
        Extract the sequence terms from a LODA program comment.
        
        Args:
            program_text: The full LODA program as text
            
        Returns:
            List of sequence terms or None if not found
        """
        lines = program_text.strip().split('\n')
        
        for line in lines:
            # Look for lines with comma-separated numbers (sequence terms)
            if line.startswith(';') and ',' in line:
                # Extract numbers from the line
                numbers_str = line[1:].strip()  # Remove the ';'
                # Skip if it looks like it contains non-numeric content
                if ':' in numbers_str or any(c.isalpha() for c in numbers_str):
                    continue
                    
                try:
                    terms = [int(x.strip()) for x in numbers_str.split(',') if x.strip()]
                    if len(terms) >= 5:  # Reasonable number of terms
                        return terms
                except ValueError:
                    continue
                    
        return None
    
    def clean_loda_code(self, program_text: str) -> str:
        """
        Clean LODA code by removing comments and normalizing format.
        
        Args:
            program_text: Raw LODA program text
            
        Returns:
            Cleaned LODA code suitable for training
        """
        lines = program_text.strip().split('\n')
        code_lines = []
        
        for line in lines:
            # Skip comment lines (lines that start with ;)
            if line.strip().startswith(';'):
                continue
            # Skip empty lines
            if not line.strip():
                continue
            
            # Remove inline comments (everything after ; on the same line)
            if ';' in line:
                code_part = line.split(';')[0].strip()
            else:
                code_part = line.strip()
            
            # Only add non-empty code lines
            if code_part:
                code_lines.append(code_part)
        
        return '\n'.join(code_lines)
    
    def create_training_examples(self, max_examples: int = -1) -> List[TrainingExample]:
        """
        Create training examples from all available LODA programs.
        
        Args:
            max_examples: Maximum number of examples to create (-1 for all)
            
        Returns:
            List of TrainingExample objects
        """
        examples = []
        program_ids = self.program_cache.all_ids()
        
        if max_examples > 0:
            program_ids = program_ids[:max_examples]
        
        print(f"Processing {len(program_ids)} programs...")
        
        for i, program_id in enumerate(program_ids):
            if i % 1000 == 0:
                print(f"Processed {i}/{len(program_ids)} programs")
                
            try:
                # Read the program file
                program_path = self.program_cache.path(program_id)
                if not os.path.exists(program_path):
                    continue
                    
                with open(program_path, 'r') as f:
                    program_text = f.read()
                
                # Extract description
                description = self.extract_description_from_program(program_text)
                if not description:
                    continue
                
                # Extract terms (optional)
                terms = self.extract_terms_from_program(program_text)
                
                # Clean the LODA code
                clean_code = self.clean_loda_code(program_text)
                if not clean_code:
                    continue
                
                # Validate that the code parses correctly
                try:
                    Program(clean_code)
                except Exception:
                    continue  # Skip programs that don't parse
                
                example = TrainingExample(
                    sequence_id=str(program_id),
                    description=description,
                    loda_code=clean_code,
                    terms=terms
                )
                examples.append(example)
                
            except Exception as e:
                print(f"Error processing {program_id}: {e}")
                continue
        
        print(f"Created {len(examples)} training examples")
        return examples
    
    def augment_descriptions(self, examples: List[TrainingExample]) -> List[TrainingExample]:
        """
        Augment training examples with variations of descriptions.
        
        This can help make the model more robust to different phrasings.
        
        Args:
            examples: List of original training examples
            
        Returns:
            Augmented list with additional variations
        """
        augmented = list(examples)  # Start with originals
        
        for example in examples:
            desc = example.description
            
            # Create variations
            variations = []
            
            # Add "sequence of" prefix if not present
            if not desc.lower().startswith(('sequence', 'the sequence')):
                variations.append(f"Sequence of {desc.lower()}")
            
            # Add "Generate" prefix
            variations.append(f"Generate {desc.lower()}")
            
            # Add "Compute" prefix
            variations.append(f"Compute {desc.lower()}")
            
            # Remove mathematical symbols for simpler versions
            simple_desc = re.sub(r'[()=+\-*/^]', ' ', desc)
            simple_desc = re.sub(r'\s+', ' ', simple_desc).strip()
            if simple_desc != desc and simple_desc:
                variations.append(simple_desc)
            
            # Create new examples for each variation
            for variation in variations:
                augmented_example = TrainingExample(
                    sequence_id=str(example.sequence_id) + "_aug",
                    description=variation,
                    loda_code=example.loda_code,
                    terms=example.terms
                )
                augmented.append(augmented_example)
        
        return augmented
    
    def save_dataset(self, examples: List[TrainingExample], output_file: str):
        """
        Save training examples to a file for later use.
        
        Args:
            examples: List of training examples
            output_file: Path to output file
        """
        import json
        
        data = []
        for example in examples:
            data.append({
                'sequence_id': example.sequence_id,
                'description': example.description,
                'loda_code': example.loda_code,
                'terms': example.terms
            })
        
        with open(output_file, 'w') as f:
            json.dump(data, f, indent=2)
        
        print(f"Saved {len(examples)} examples to {output_file}")
    
    def load_dataset(self, input_file: str) -> List[TrainingExample]:
        """
        Load training examples from a file.
        
        Args:
            input_file: Path to input file
            
        Returns:
            List of TrainingExample objects
        """
        import json
        
        with open(input_file, 'r') as f:
            data = json.load(f)
        
        examples = []
        for item in data:
            example = TrainingExample(
                sequence_id=item['sequence_id'],
                description=item['description'],
                loda_code=item['loda_code'],
                terms=item.get('terms')
            )
            examples.append(example)
        
        print(f"Loaded {len(examples)} examples from {input_file}")
        return examples

Handles preprocessing of OEIS programs for LLM training.

Initialize with path to OEIS programs directory.
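
A typical preprocessing flow (the paths and example cap are illustrative):

>>> from loda.llm import DataPreprocessor
>>>
>>> preprocessor = DataPreprocessor("programs/oeis")
>>> examples = preprocessor.create_training_examples(max_examples=100)
>>> examples = preprocessor.augment_descriptions(examples)
>>> preprocessor.save_dataset(examples, "loda_dataset.json")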

Methods

def augment_descriptions(self,
examples: List[TrainingExample]) ‑> List[TrainingExample]
def augment_descriptions(self, examples: List[TrainingExample]) -> List[TrainingExample]:
    """
    Augment training examples with variations of descriptions.
    
    This can help make the model more robust to different phrasings.
    
    Args:
        examples: List of original training examples
        
    Returns:
        Augmented list with additional variations
    """
    augmented = list(examples)  # Start with originals
    
    for example in examples:
        desc = example.description
        
        # Create variations
        variations = []
        
        # Add "sequence of" prefix if not present
        if not desc.lower().startswith(('sequence', 'the sequence')):
            variations.append(f"Sequence of {desc.lower()}")
        
        # Add "Generate" prefix
        variations.append(f"Generate {desc.lower()}")
        
        # Add "Compute" prefix
        variations.append(f"Compute {desc.lower()}")
        
        # Remove mathematical symbols for simpler versions
        simple_desc = re.sub(r'[()=+\-*/^]', ' ', desc)
        simple_desc = re.sub(r'\s+', ' ', simple_desc).strip()
        if simple_desc != desc and simple_desc:
            variations.append(simple_desc)
        
        # Create new examples for each variation
        for variation in variations:
            augmented_example = TrainingExample(
                sequence_id=str(example.sequence_id) + "_aug",
                description=variation,
                loda_code=example.loda_code,
                terms=example.terms
            )
            augmented.append(augmented_example)
    
    return augmented

Augment training examples with variations of descriptions.

This can help make the model more robust to different phrasings.

Args

examples
List of original training examples

Returns

Augmented list with additional variations
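
A small illustration of the variations produced, assuming TrainingExample can be imported from loda.llm.data_preprocessing (where it is used) and reusing the preprocessor from the class example above:

>>> from loda.llm.data_preprocessing import TrainingExample
>>>
>>> example = TrainingExample(sequence_id="A000045",
...                           description="Fibonacci numbers",
...                           loda_code="mov $1,$0",
...                           terms=[0, 1, 1, 2, 3])
>>> augmented = preprocessor.augment_descriptions([example])
>>> [a.description for a in augmented]
['Fibonacci numbers', 'Sequence of fibonacci numbers', 'Generate fibonacci numbers', 'Compute fibonacci numbers']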

def clean_loda_code(self, program_text: str) ‑> str
def clean_loda_code(self, program_text: str) -> str:
    """
    Clean LODA code by removing comments and normalizing format.
    
    Args:
        program_text: Raw LODA program text
        
    Returns:
        Cleaned LODA code suitable for training
    """
    lines = program_text.strip().split('\n')
    code_lines = []
    
    for line in lines:
        # Skip comment lines (lines that start with ;)
        if line.strip().startswith(';'):
            continue
        # Skip empty lines
        if not line.strip():
            continue
        
        # Remove inline comments (everything after ; on the same line)
        if ';' in line:
            code_part = line.split(';')[0].strip()
        else:
            code_part = line.strip()
        
        # Only add non-empty code lines
        if code_part:
            code_lines.append(code_part)
    
    return '\n'.join(code_lines)

Clean LODA code by removing comments and normalizing format.

Args

program_text
Raw LODA program text

Returns

Cleaned LODA code suitable for training
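
For example, header and inline comments are stripped while the instructions themselves are kept:

>>> raw = "; A000045: Fibonacci numbers.\n; 0,1,1,2,3,5\nmov $3,1\nadd $1,$3 ; inline comment\n"
>>> print(preprocessor.clean_loda_code(raw))
mov $3,1
add $1,$3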

def create_training_examples(self, max_examples: int = -1) ‑> List[TrainingExample]
def create_training_examples(self, max_examples: int = -1) -> List[TrainingExample]:
    """
    Create training examples from all available LODA programs.
    
    Args:
        max_examples: Maximum number of examples to create (-1 for all)
        
    Returns:
        List of TrainingExample objects
    """
    examples = []
    program_ids = self.program_cache.all_ids()
    
    if max_examples > 0:
        program_ids = program_ids[:max_examples]
    
    print(f"Processing {len(program_ids)} programs...")
    
    for i, program_id in enumerate(program_ids):
        if i % 1000 == 0:
            print(f"Processed {i}/{len(program_ids)} programs")
            
        try:
            # Read the program file
            program_path = self.program_cache.path(program_id)
            if not os.path.exists(program_path):
                continue
                
            with open(program_path, 'r') as f:
                program_text = f.read()
            
            # Extract description
            description = self.extract_description_from_program(program_text)
            if not description:
                continue
            
            # Extract terms (optional)
            terms = self.extract_terms_from_program(program_text)
            
            # Clean the LODA code
            clean_code = self.clean_loda_code(program_text)
            if not clean_code:
                continue
            
            # Validate that the code parses correctly
            try:
                Program(clean_code)
            except Exception:
                continue  # Skip programs that don't parse
            
            example = TrainingExample(
                sequence_id=str(program_id),
                description=description,
                loda_code=clean_code,
                terms=terms
            )
            examples.append(example)
            
        except Exception as e:
            print(f"Error processing {program_id}: {e}")
            continue
    
    print(f"Created {len(examples)} training examples")
    return examples

Create training examples from all available LODA programs.

Args

max_examples
Maximum number of examples to create (-1 for all)

Returns

List of TrainingExample objects

def extract_description_from_program(self, program_text: str) ‑> str | None
def extract_description_from_program(self, program_text: str) -> Optional[str]:
    """
    Extract the natural language description from a LODA program.
    
    LODA programs typically start with comments like:
    ; A000045: Fibonacci numbers: F(n) = F(n-1) + F(n-2) with F(0) = 0 and F(1) = 1.
    
    Args:
        program_text: The full LODA program as text
        
    Returns:
        The description string or None if no description found
    """
    lines = program_text.strip().split('\n')
    
    for line in lines:
        # Look for OEIS description lines (start with ; A######:)
        match = re.match(r';\s*A\d{6}:\s*(.+)', line)
        if match:
            description = match.group(1).strip()
            # Clean up common artifacts
            description = description.rstrip('.')
            # Remove mathematical notation that might be confusing
            # Keep it simple for initial training
            return description
            
    return None

Extract the natural language description from a LODA program.

LODA programs typically start with comments like: ; A000045: Fibonacci numbers: F(n) = F(n-1) + F(n-2) with F(0) = 0 and F(1) = 1.

Args

program_text
The full LODA program as text

Returns

The description string or None if no description found
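
For example, given a program with the usual header comment, the A-number prefix and trailing period are removed:

>>> text = "; A000045: Fibonacci numbers: F(n) = F(n-1) + F(n-2).\n; 0,1,1,2,3,5,8\nmov $1,$0\n"
>>> preprocessor.extract_description_from_program(text)
'Fibonacci numbers: F(n) = F(n-1) + F(n-2)'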

def extract_terms_from_program(self, program_text: str) ‑> List[int] | None
def extract_terms_from_program(self, program_text: str) -> Optional[List[int]]:
    """
    Extract the sequence terms from a LODA program comment.
    
    Args:
        program_text: The full LODA program as text
        
    Returns:
        List of sequence terms or None if not found
    """
    lines = program_text.strip().split('\n')
    
    for line in lines:
        # Look for lines with comma-separated numbers (sequence terms)
        if line.startswith(';') and ',' in line:
            # Extract numbers from the line
            numbers_str = line[1:].strip()  # Remove the ';'
            # Skip if it looks like it contains non-numeric content
            if ':' in numbers_str or any(c.isalpha() for c in numbers_str):
                continue
                
            try:
                terms = [int(x.strip()) for x in numbers_str.split(',') if x.strip()]
                if len(terms) >= 5:  # Reasonable number of terms
                    return terms
            except ValueError:
                continue
                
    return None

Extract the sequence terms from a LODA program comment.

Args

program_text
The full LODA program as text

Returns

List of sequence terms or None if not found
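
For example, the terms are read from the first purely numeric comment line:

>>> text = "; A000045: Fibonacci numbers.\n; 0,1,1,2,3,5,8,13\nmov $1,$0\n"
>>> preprocessor.extract_terms_from_program(text)
[0, 1, 1, 2, 3, 5, 8, 13]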

def load_dataset(self, input_file: str) ‑> List[TrainingExample]
def load_dataset(self, input_file: str) -> List[TrainingExample]:
    """
    Load training examples from a file.
    
    Args:
        input_file: Path to input file
        
    Returns:
        List of TrainingExample objects
    """
    import json
    
    with open(input_file, 'r') as f:
        data = json.load(f)
    
    examples = []
    for item in data:
        example = TrainingExample(
            sequence_id=item['sequence_id'],
            description=item['description'],
            loda_code=item['loda_code'],
            terms=item.get('terms')
        )
        examples.append(example)
    
    print(f"Loaded {len(examples)} examples from {input_file}")
    return examples

Load training examples from a file.

Args

input_file
Path to input file

Returns

List of TrainingExample objects

def save_dataset(self,
examples: List[TrainingExample],
output_file: str)
def save_dataset(self, examples: List[TrainingExample], output_file: str):
    """
    Save training examples to a file for later use.
    
    Args:
        examples: List of training examples
        output_file: Path to output file
    """
    import json
    
    data = []
    for example in examples:
        data.append({
            'sequence_id': example.sequence_id,
            'description': example.description,
            'loda_code': example.loda_code,
            'terms': example.terms
        })
    
    with open(output_file, 'w') as f:
        json.dump(data, f, indent=2)
    
    print(f"Saved {len(examples)} examples to {output_file}")

Save training examples to a file for later use.

Args

examples
List of training examples
output_file
Path to output file
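
Datasets round-trip through JSON, so preprocessing can be done once and reused later (the file name is illustrative):

>>> preprocessor.save_dataset(examples, "loda_dataset.json")
>>> reloaded = preprocessor.load_dataset("loda_dataset.json")
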
class GenerationResult (description: str,
generated_code: str,
is_valid: bool,
error_message: str | None = None,
generated_sequence: List[int] | None = None,
generation_time: float = 0.0)
@dataclass
class GenerationResult:
    """Result of code generation."""
    description: str
    generated_code: str
    is_valid: bool
    error_message: Optional[str] = None
    generated_sequence: Optional[List[int]] = None
    generation_time: float = 0.0

Result of code generation.

Instance variables

var description : str
var error_message : str | None
var generated_code : str
var generated_sequence : List[int] | None
var generation_time : float
var is_valid : bool
class LodaEvaluator (model: LodaT5Model)
class LodaEvaluator:
    """Evaluator for assessing model performance."""
    
    def __init__(self, model: LodaT5Model):
        """
        Initialize the evaluator.
        
        Args:
            model: Trained LodaT5Model to evaluate
        """
        self.model = model
        self.generator = LodaGenerator(model)
    
    def evaluate_examples(self, test_examples: List[TrainingExample]) -> Tuple[Dict[str, float], List[GenerationResult]]:
        """
        Evaluate the model on test examples.
        
        Args:
            test_examples: List of test examples
            
        Returns:
            Tuple of (metrics dictionary, list of GenerationResult objects)
        """
        print(f"Evaluating on {len(test_examples)} examples...")
        
        total_examples = len(test_examples)
        valid_programs = 0
        exact_matches = 0
        sequence_matches = 0
        total_generation_time = 0
        
        results = []
        
        for i, example in enumerate(test_examples):
            if i % 10 == 0:
                print(f"Progress: {i}/{total_examples}")
            
            # Generate code
            generation_results = self.generator.generate(example.description, num_samples=1)
            
            if generation_results:
                result = generation_results[0]
                results.append(result)
                
                total_generation_time += result.generation_time
                
                if result.is_valid:
                    valid_programs += 1
                    
                    # Check for exact match
                    if self._normalize_code(result.generated_code) == self._normalize_code(example.loda_code):
                        exact_matches += 1
                    
                    # Check for sequence match (if we have expected terms)
                    if (example.terms and result.generated_sequence and 
                        len(result.generated_sequence) >= 3 and
                        result.generated_sequence[:3] == example.terms[:3]):
                        sequence_matches += 1
        
        # Calculate metrics
        metrics = {
            'total_examples': total_examples,
            'valid_program_rate': valid_programs / total_examples if total_examples > 0 else 0,
            'exact_match_rate': exact_matches / total_examples if total_examples > 0 else 0,
            'sequence_match_rate': sequence_matches / total_examples if total_examples > 0 else 0,
            'avg_generation_time': total_generation_time / total_examples if total_examples > 0 else 0,
            'valid_programs': valid_programs,
            'exact_matches': exact_matches,
            'sequence_matches': sequence_matches
        }
        
        return metrics, results
    
    def _normalize_code(self, code: str) -> str:
        """Normalize code for comparison."""
        # Remove extra whitespace and normalize format
        lines = []
        for line in code.strip().split('\n'):
            line = line.strip()
            if line:
                lines.append(line)
        return '\n'.join(lines)
    
    def print_evaluation_report(self, metrics: Dict[str, float], results: List[GenerationResult]):
        """Print a detailed evaluation report."""
        print("\n" + "="*60)
        print("LODA LLM EVALUATION REPORT")
        print("="*60)
        
        print(f"Total Examples: {metrics['total_examples']}")
        print(f"Valid Programs: {metrics['valid_programs']} ({metrics['valid_program_rate']:.1%})")
        print(f"Exact Matches: {metrics['exact_matches']} ({metrics['exact_match_rate']:.1%})")
        print(f"Sequence Matches: {metrics['sequence_matches']} ({metrics['sequence_match_rate']:.1%})")
        print(f"Avg Generation Time: {metrics['avg_generation_time']:.2f}s")
        
        # Show some example results
        print("\n" + "-"*60)
        print("SAMPLE RESULTS")
        print("-"*60)
        
        # Show successful examples
        successful = [r for r in results if r.is_valid]
        if successful:
            print("\nSuccessful generations:")
            for i, result in enumerate(successful[:3]):  # Show first 3
                print(f"\n{i+1}. Description: {result.description}")
                print(f"   Generated: {result.generated_code.replace(chr(10), '; ')}")
                if result.generated_sequence:
                    print(f"   Sequence: {result.generated_sequence}")
        
        # Show failed examples
        failed = [r for r in results if not r.is_valid]
        if failed:
            print(f"\nFailed generations ({len(failed)} total):")
            for i, result in enumerate(failed[:3]):  # Show first 3
                print(f"\n{i+1}. Description: {result.description}")
                print(f"   Error: {result.error_message}")
                print(f"   Generated: {result.generated_code.replace(chr(10), '; ')}")

Evaluator for assessing model performance.

Initialize the evaluator.

Args

model
Trained LodaT5Model to evaluate

Methods

def evaluate_examples(self,
test_examples: List[TrainingExample]) ‑> Tuple[Dict[str, float], List[GenerationResult]]
def evaluate_examples(self, test_examples: List[TrainingExample]) -> Tuple[Dict[str, float], List[GenerationResult]]:
    """
    Evaluate the model on test examples.
    
    Args:
        test_examples: List of test examples
        
    Returns:
        Tuple of (metrics dictionary, list of GenerationResult objects)
    """
    print(f"Evaluating on {len(test_examples)} examples...")
    
    total_examples = len(test_examples)
    valid_programs = 0
    exact_matches = 0
    sequence_matches = 0
    total_generation_time = 0
    
    results = []
    
    for i, example in enumerate(test_examples):
        if i % 10 == 0:
            print(f"Progress: {i}/{total_examples}")
        
        # Generate code
        generation_results = self.generator.generate(example.description, num_samples=1)
        
        if generation_results:
            result = generation_results[0]
            results.append(result)
            
            total_generation_time += result.generation_time
            
            if result.is_valid:
                valid_programs += 1
                
                # Check for exact match
                if self._normalize_code(result.generated_code) == self._normalize_code(example.loda_code):
                    exact_matches += 1
                
                # Check for sequence match (if we have expected terms)
                if (example.terms and result.generated_sequence and 
                    len(result.generated_sequence) >= 3 and
                    result.generated_sequence[:3] == example.terms[:3]):
                    sequence_matches += 1
    
    # Calculate metrics
    metrics = {
        'total_examples': total_examples,
        'valid_program_rate': valid_programs / total_examples if total_examples > 0 else 0,
        'exact_match_rate': exact_matches / total_examples if total_examples > 0 else 0,
        'sequence_match_rate': sequence_matches / total_examples if total_examples > 0 else 0,
        'avg_generation_time': total_generation_time / total_examples if total_examples > 0 else 0,
        'valid_programs': valid_programs,
        'exact_matches': exact_matches,
        'sequence_matches': sequence_matches
    }
    
    return metrics, results

Evaluate the model on test examples.

Args

test_examples
List of test examples

Returns

Tuple of (metrics dictionary, list of GenerationResult objects)
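
A typical evaluation run, assuming model is a trained LodaT5Model and test_examples is a held-out list of TrainingExample objects:

>>> from loda.llm import LodaEvaluator
>>>
>>> evaluator = LodaEvaluator(model)
>>> metrics, results = evaluator.evaluate_examples(test_examples)
>>> evaluator.print_evaluation_report(metrics, results)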

def print_evaluation_report(self,
metrics: Dict[str, float],
results: List[GenerationResult])
def print_evaluation_report(self, metrics: Dict[str, float], results: List[GenerationResult]):
    """Print a detailed evaluation report."""
    print("\n" + "="*60)
    print("LODA LLM EVALUATION REPORT")
    print("="*60)
    
    print(f"Total Examples: {metrics['total_examples']}")
    print(f"Valid Programs: {metrics['valid_programs']} ({metrics['valid_program_rate']:.1%})")
    print(f"Exact Matches: {metrics['exact_matches']} ({metrics['exact_match_rate']:.1%})")
    print(f"Sequence Matches: {metrics['sequence_matches']} ({metrics['sequence_match_rate']:.1%})")
    print(f"Avg Generation Time: {metrics['avg_generation_time']:.2f}s")
    
    # Show some example results
    print("\n" + "-"*60)
    print("SAMPLE RESULTS")
    print("-"*60)
    
    # Show successful examples
    successful = [r for r in results if r.is_valid]
    if successful:
        print("\nSuccessful generations:")
        for i, result in enumerate(successful[:3]):  # Show first 3
            print(f"\n{i+1}. Description: {result.description}")
            print(f"   Generated: {result.generated_code.replace(chr(10), '; ')}")
            if result.generated_sequence:
                print(f"   Sequence: {result.generated_sequence}")
    
    # Show failed examples
    failed = [r for r in results if not r.is_valid]
    if failed:
        print(f"\nFailed generations ({len(failed)} total):")
        for i, result in enumerate(failed[:3]):  # Show first 3
            print(f"\n{i+1}. Description: {result.description}")
            print(f"   Error: {result.error_message}")
            print(f"   Generated: {result.generated_code.replace(chr(10), '; ')}")

Print a detailed evaluation report.

class LodaGenerator (model: LodaT5Model,
max_length: int = 256,
num_beams: int = 4)
class LodaGenerator:
    """Generator class for creating LODA code from natural language."""
    
    def __init__(self, model: LodaT5Model, max_length: int = 256, num_beams: int = 4):
        """
        Initialize the generator.
        
        Args:
            model: Trained LodaT5Model
            max_length: Maximum length of generated code
            num_beams: Number of beams for beam search
        """
        self.model = model
        self.max_length = max_length
        self.num_beams = num_beams
    
    def generate(self, description: str, num_samples: int = 1) -> List[GenerationResult]:
        """
        Generate LODA code from a natural language description.
        
        Args:
            description: Natural language description of the sequence
            num_samples: Number of code samples to generate
            
        Returns:
            List of GenerationResult objects
        """
        start_time = time.time()
        
        # Generate multiple samples
        descriptions = [description] * num_samples
        generated_codes = self.model.generate(
            descriptions, 
            max_length=self.max_length,
            num_beams=self.num_beams
        )
        
        generation_time = time.time() - start_time
        
        results = []
        for code in generated_codes:
            result = self._validate_and_evaluate_code(description, code)
            result.generation_time = generation_time / num_samples
            results.append(result)
        
        return results
    
    def _validate_and_evaluate_code(self, description: str, code: str) -> GenerationResult:
        """
        Validate and evaluate generated LODA code.
        
        Args:
            description: Original description
            code: Generated LODA code
            
        Returns:
            GenerationResult with validation info
        """
        result = GenerationResult(
            description=description,
            generated_code=code,
            is_valid=False
        )
        
        try:
            # Try to parse the program
            program = Program(code)
            
            # Try to evaluate it for a few terms
            interpreter = Interpreter(max_memory=100, max_stack=10, max_steps=10000)
            evaluator = Evaluator(program, interpreter)
            
            sequence_terms = []
            for i in range(10):  # Generate first 10 terms
                try:
                    term = evaluator(i)
                    sequence_terms.append(term)
                except Exception:
                    break  # Stop if evaluation fails
            
            if len(sequence_terms) >= 3:  # At least 3 terms generated
                result.is_valid = True
                result.generated_sequence = sequence_terms
            else:
                result.error_message = "Could not generate sufficient sequence terms"
        
        except Exception as e:
            result.error_message = f"Program validation failed: {str(e)}"
        
        return result
    
    def generate_interactive(self):
        """Interactive mode for generating LODA code."""
        print("LODA Code Generator - Interactive Mode")
        print("Enter natural language descriptions to generate LODA code.")
        print("Type 'quit' to exit.\n")
        
        while True:
            try:
                description = input("Description: ").strip()
                
                if description.lower() in ['quit', 'exit', 'q']:
                    print("Goodbye!")
                    break
                
                if not description:
                    continue
                
                print("Generating code...")
                results = self.generate(description, num_samples=1)
                
                for i, result in enumerate(results):
                    print(f"\n--- Result {i+1} ---")
                    print(f"Generated in {result.generation_time:.2f}s")
                    print(f"Valid: {result.is_valid}")
                    
                    if result.error_message:
                        print(f"Error: {result.error_message}")
                    
                    print("Generated LODA code:")
                    print(result.generated_code)
                    
                    if result.generated_sequence:
                        print(f"Sequence terms: {result.generated_sequence}")
                    
                    print("-" * 50)
            
            except KeyboardInterrupt:
                print("\nGoodbye!")
                break
            except Exception as e:
                print(f"Error: {e}")

Generator class for creating LODA code from natural language.

Initialize the generator.

Args

model
Trained LodaT5Model
max_length
Maximum length of generated code
num_beams
Number of beams for beam search

Methods

def generate(self, description: str, num_samples: int = 1) ‑> List[GenerationResult]
def generate(self, description: str, num_samples: int = 1) -> List[GenerationResult]:
    """
    Generate LODA code from a natural language description.
    
    Args:
        description: Natural language description of the sequence
        num_samples: Number of code samples to generate
        
    Returns:
        List of GenerationResult objects
    """
    start_time = time.time()
    
    # Generate multiple samples
    descriptions = [description] * num_samples
    generated_codes = self.model.generate(
        descriptions, 
        max_length=self.max_length,
        num_beams=self.num_beams
    )
    
    generation_time = time.time() - start_time
    
    results = []
    for code in generated_codes:
        result = self._validate_and_evaluate_code(description, code)
        result.generation_time = generation_time / num_samples
        results.append(result)
    
    return results

Generate LODA code from a natural language description.

Args

description
Natural language description of the sequence
num_samples
Number of code samples to generate

Returns

List of GenerationResult objects
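
Each GenerationResult carries both the generated code and the outcome of validating it; a sketch, assuming generator wraps a trained model and using an illustrative prompt:

>>> results = generator.generate("Triangular numbers", num_samples=1)
>>> result = results[0]
>>> if result.is_valid:
...     print(result.generated_sequence)
... else:
...     print(result.error_message)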

def generate_interactive(self)
def generate_interactive(self):
    """Interactive mode for generating LODA code."""
    print("LODA Code Generator - Interactive Mode")
    print("Enter natural language descriptions to generate LODA code.")
    print("Type 'quit' to exit.\n")
    
    while True:
        try:
            description = input("Description: ").strip()
            
            if description.lower() in ['quit', 'exit', 'q']:
                print("Goodbye!")
                break
            
            if not description:
                continue
            
            print("Generating code...")
            results = self.generate(description, num_samples=1)
            
            for i, result in enumerate(results):
                print(f"\n--- Result {i+1} ---")
                print(f"Generated in {result.generation_time:.2f}s")
                print(f"Valid: {result.is_valid}")
                
                if result.error_message:
                    print(f"Error: {result.error_message}")
                
                print("Generated LODA code:")
                print(result.generated_code)
                
                if result.generated_sequence:
                    print(f"Sequence terms: {result.generated_sequence}")
                
                print("-" * 50)
        
        except KeyboardInterrupt:
            print("\nGoodbye!")
            break
        except Exception as e:
            print(f"Error: {e}")

Interactive mode for generating LODA code.

class LodaT5Model (model_name: str = 't5-small', loda_vocab_size: int | None = None)
class LodaT5Model(nn.Module):
    """
    T5-based model for natural language to LODA code generation.
    """
    
    def __init__(self, model_name: str = "t5-small", loda_vocab_size: Optional[int] = None):
        """
        Initialize the model.
        
        Args:
            model_name: Base T5 model to use
            loda_vocab_size: Size of LODA vocabulary (if extending tokenizer)
        """
        super().__init__()
        
        # Load base T5 model and tokenizer
        self.text_tokenizer = T5Tokenizer.from_pretrained(model_name)
        self.model = T5ForConditionalGeneration.from_pretrained(model_name)
        
        # Initialize LODA tokenizer
        self.loda_tokenizer = LodaTokenizer()
        
        # If we need to extend the vocabulary
        if loda_vocab_size and loda_vocab_size > self.loda_tokenizer.vocab_size:
            # Could extend vocabulary here if needed
            pass
    
    def prepare_input(self, descriptions: List[str]) -> Dict[str, torch.Tensor]:
        """
        Prepare natural language descriptions for input.
        
        Args:
            descriptions: List of natural language descriptions
            
        Returns:
            Dictionary with input tensors
        """
        # Add task prefix for T5
        prefixed_descriptions = [f"translate to loda: {desc}" for desc in descriptions]
        
        # Tokenize with T5 tokenizer
        encoded = self.text_tokenizer(
            prefixed_descriptions,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )
        
        return encoded
    
    def prepare_target(self, loda_codes: List[str]) -> Dict[str, torch.Tensor]:
        """
        Prepare LODA codes as targets.
        
        Args:
            loda_codes: List of LODA assembly codes
            
        Returns:
            Dictionary with target tensors
        """
        # For T5, we need to encode targets using the text tokenizer as well
        # We'll create a custom format that represents LODA code
        
        # Convert LODA to a text representation that T5 can understand
        text_loda_codes = []
        for code in loda_codes:
            # Convert LODA code to a more text-like format
            text_code = self.loda_to_text_format(code)
            text_loda_codes.append(text_code)
        
        encoded = self.text_tokenizer(
            text_loda_codes,
            padding=True,
            truncation=True,
            max_length=256,
            return_tensors="pt"
        )
        
        return encoded
    
    def loda_to_text_format(self, code: str) -> str:
        """
        Convert LODA code to a text format suitable for T5.
        
        This creates a more natural language representation of LODA code.
        
        Args:
            code: LODA assembly code
            
        Returns:
            Text representation of the code
        """
        lines = code.strip().split('\n')
        text_parts = []
        
        for line in lines:
            line = line.strip()
            if not line:
                continue
            
            # Parse the line and convert to text
            parts = line.replace(',', ' ').split()
            if len(parts) >= 3:
                op, target, source = parts[0], parts[1], parts[2]
                text_parts.append(f"{op} {target} {source}")
            elif len(parts) >= 2:
                op, target = parts[0], parts[1]
                text_parts.append(f"{op} {target}")
            else:
                text_parts.append(line)
        
        return " | ".join(text_parts)
    
    def text_format_to_loda(self, text_code: str) -> str:
        """
        Convert text format back to LODA code.
        
        Args:
            text_code: Text representation of LODA code
            
        Returns:
            LODA assembly code
        """
        parts = text_code.split(" | ")
        loda_lines = []
        
        for part in parts:
            part = part.strip()
            if not part:
                continue
            
            tokens = part.split()
            if len(tokens) >= 3:
                op, target, source = tokens[0], tokens[1], tokens[2]
                loda_lines.append(f"{op} {target},{source}")
            elif len(tokens) >= 2:
                op, target = tokens[0], tokens[1]
                loda_lines.append(f"{op} {target}")
            else:
                loda_lines.append(part)
        
        return '\n'.join(loda_lines)
    
    def forward(self, input_ids, attention_mask, labels=None):
        """
        Forward pass of the model.
        
        Args:
            input_ids: Input token IDs
            attention_mask: Attention mask
            labels: Target labels (for training)
            
        Returns:
            Model outputs
        """
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
    
    def generate(self, descriptions: List[str], max_length: int = 256, num_beams: int = 4) -> List[str]:
        """
        Generate LODA code from natural language descriptions.
        
        Args:
            descriptions: List of natural language descriptions
            max_length: Maximum length of generated sequences
            num_beams: Number of beams for beam search
            
        Returns:
            List of generated LODA codes
        """
        # Prepare input
        inputs = self.prepare_input(descriptions)
        
        # Generate with the model
        with torch.no_grad():
            generated_ids = self.model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_length=max_length,
                num_beams=num_beams,
                early_stopping=True,
                do_sample=False
            )
        
        # Decode generated sequences
        generated_texts = self.text_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        
        # Convert from text format back to LODA
        loda_codes = [self.text_format_to_loda(text) for text in generated_texts]
        
        return loda_codes
    
    def save_model(self, save_path: str):
        """
        Save the model and tokenizers.
        
        Args:
            save_path: Directory to save the model
        """
        os.makedirs(save_path, exist_ok=True)
        
        # Save T5 model and tokenizer
        self.model.save_pretrained(save_path)
        self.text_tokenizer.save_pretrained(save_path)
        
        # Save LODA tokenizer
        loda_tokenizer_path = os.path.join(save_path, "loda_tokenizer.json")
        with open(loda_tokenizer_path, 'w') as f:
            json.dump({
                'vocab': self.loda_tokenizer.vocab,
                'reverse_vocab': {str(k): v for k, v in self.loda_tokenizer.reverse_vocab.items()}
            }, f, indent=2)
    
    @classmethod
    def load_model(cls, load_path: str):
        """
        Load a saved model.
        
        Args:
            load_path: Directory containing the saved model
            
        Returns:
            Loaded LodaT5Model instance
        """
        # Load T5 model and tokenizer
        model = T5ForConditionalGeneration.from_pretrained(load_path)
        text_tokenizer = T5Tokenizer.from_pretrained(load_path)
        
        # Create model instance
        loda_model = cls()
        loda_model.model = model
        loda_model.text_tokenizer = text_tokenizer
        
        # Load LODA tokenizer if it exists
        loda_tokenizer_path = os.path.join(load_path, "loda_tokenizer.json")
        if os.path.exists(loda_tokenizer_path):
            with open(loda_tokenizer_path, 'r') as f:
                tokenizer_data = json.load(f)
            
            loda_model.loda_tokenizer.vocab = tokenizer_data['vocab']
            loda_model.loda_tokenizer.reverse_vocab = {
                int(k): v for k, v in tokenizer_data['reverse_vocab'].items()
            }
        
        return loda_model

T5-based model for natural language to LODA code generation.

Initialize the model.

Args

model_name
Base T5 model to use
loda_vocab_size
Size of LODA vocabulary (if extending tokenizer)

Ancestors

  • torch.nn.modules.module.Module

Static methods

def load_model(load_path: str)

Load a saved model.

Args

load_path
Directory containing the saved model

Returns

Loaded LodaT5Model instance
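
A reload-and-generate sketch, assuming a model was previously saved to the (illustrative) directory loda_llm_model:

>>> from loda.llm import LodaT5Model
>>>
>>> model = LodaT5Model.load_model("loda_llm_model")
>>> codes = model.generate(["Fibonacci numbers"])
>>> print(codes[0])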

Methods

def forward(self, input_ids, attention_mask, labels=None) ‑> Callable[..., Any]
def forward(self, input_ids, attention_mask, labels=None):
    """
    Forward pass of the model.
    
    Args:
        input_ids: Input token IDs
        attention_mask: Attention mask
        labels: Target labels (for training)
        
    Returns:
        Model outputs
    """
    return self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        labels=labels
    )

Forward pass of the model.

Args

input_ids
Input token IDs
attention_mask
Attention mask
labels
Target labels (for training)

Returns

Model outputs

def generate(self, descriptions: List[str], max_length: int = 256, num_beams: int = 4) ‑> List[str]
def generate(self, descriptions: List[str], max_length: int = 256, num_beams: int = 4) -> List[str]:
    """
    Generate LODA code from natural language descriptions.
    
    Args:
        descriptions: List of natural language descriptions
        max_length: Maximum length of generated sequences
        num_beams: Number of beams for beam search
        
    Returns:
        List of generated LODA codes
    """
    # Prepare input
    inputs = self.prepare_input(descriptions)
    
    # Generate with the model
    with torch.no_grad():
        generated_ids = self.model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=True,
            do_sample=False
        )
    
    # Decode generated sequences
    generated_texts = self.text_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    
    # Convert from text format back to LODA
    loda_codes = [self.text_format_to_loda(text) for text in generated_texts]
    
    return loda_codes

Generate LODA code from natural language descriptions.

Args

descriptions
List of natural language descriptions
max_length
Maximum length of generated sequences
num_beams
Number of beams for beam search

Returns

List of generated LODA codes
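
Descriptions are processed as a batch, with one generated program returned per input description:

>>> codes = model.generate(["Fibonacci numbers", "Powers of 2"], num_beams=4)
>>> len(codes)
2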

def loda_to_text_format(self, code: str) ‑> str
def loda_to_text_format(self, code: str) -> str:
    """
    Convert LODA code to a text format suitable for T5.
    
    This creates a more natural language representation of LODA code.
    
    Args:
        code: LODA assembly code
        
    Returns:
        Text representation of the code
    """
    lines = code.strip().split('\n')
    text_parts = []
    
    for line in lines:
        line = line.strip()
        if not line:
            continue
        
        # Parse the line and convert to text
        parts = line.replace(',', ' ').split()
        if len(parts) >= 3:
            op, target, source = parts[0], parts[1], parts[2]
            text_parts.append(f"{op} {target} {source}")
        elif len(parts) >= 2:
            op, target = parts[0], parts[1]
            text_parts.append(f"{op} {target}")
        else:
            text_parts.append(line)
    
    return " | ".join(text_parts)

Convert LODA code to a text format suitable for T5.

This creates a more natural language representation of LODA code.

Args

code
LODA assembly code

Returns

Text representation of the code
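
For example, operands are separated by spaces and instructions are joined with " | ":

>>> model.loda_to_text_format("mov $1,$0\nadd $1,1\nlpe")
'mov $1 $0 | add $1 1 | lpe'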

def prepare_input(self, descriptions: List[str]) ‑> Dict[str, torch.Tensor]
def prepare_input(self, descriptions: List[str]) -> Dict[str, torch.Tensor]:
    """
    Prepare natural language descriptions for input.
    
    Args:
        descriptions: List of natural language descriptions
        
    Returns:
        Dictionary with input tensors
    """
    # Add task prefix for T5
    prefixed_descriptions = [f"translate to loda: {desc}" for desc in descriptions]
    
    # Tokenize with T5 tokenizer
    encoded = self.text_tokenizer(
        prefixed_descriptions,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )
    
    return encoded

Prepare natural language descriptions for input.

Args

descriptions
List of natural language descriptions

Returns

Dictionary with input tensors

def prepare_target(self, loda_codes: List[str]) ‑> Dict[str, torch.Tensor]
def prepare_target(self, loda_codes: List[str]) -> Dict[str, torch.Tensor]:
    """
    Prepare LODA codes as targets.
    
    Args:
        loda_codes: List of LODA assembly codes
        
    Returns:
        Dictionary with target tensors
    """
    # For T5, we need to encode targets using the text tokenizer as well
    # We'll create a custom format that represents LODA code
    
    # Convert LODA to a text representation that T5 can understand
    text_loda_codes = []
    for code in loda_codes:
        # Convert LODA code to a more text-like format
        text_code = self.loda_to_text_format(code)
        text_loda_codes.append(text_code)
    
    encoded = self.text_tokenizer(
        text_loda_codes,
        padding=True,
        truncation=True,
        max_length=256,
        return_tensors="pt"
    )
    
    return encoded

Prepare LODA codes as targets.

Args

loda_codes
List of LODA assembly codes

Returns

Dictionary with target tensors

def save_model(self, save_path: str)
def save_model(self, save_path: str):
    """
    Save the model and tokenizers.
    
    Args:
        save_path: Directory to save the model
    """
    os.makedirs(save_path, exist_ok=True)
    
    # Save T5 model and tokenizer
    self.model.save_pretrained(save_path)
    self.text_tokenizer.save_pretrained(save_path)
    
    # Save LODA tokenizer
    loda_tokenizer_path = os.path.join(save_path, "loda_tokenizer.json")
    with open(loda_tokenizer_path, 'w') as f:
        json.dump({
            'vocab': self.loda_tokenizer.vocab,
            'reverse_vocab': {str(k): v for k, v in self.loda_tokenizer.reverse_vocab.items()}
        }, f, indent=2)

Save the model and tokenizers.

Args

save_path
Directory to save the model
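
A minimal saving sketch (assuming model is a LodaT5Model instance; the directory name is illustrative): the directory receives the T5 weights and tokenizer files via save_pretrained, plus a loda_tokenizer.json holding the custom LODA vocabulary:

>>> model.save_model("trained_model")
>>> import os
>>> os.path.exists("trained_model/loda_tokenizer.json")
True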

def text_format_to_loda(self, text_code: str) ‑> str
Expand source code
def text_format_to_loda(self, text_code: str) -> str:
    """
    Convert text format back to LODA code.
    
    Args:
        text_code: Text representation of LODA code
        
    Returns:
        LODA assembly code
    """
    parts = text_code.split(" | ")
    loda_lines = []
    
    for part in parts:
        part = part.strip()
        if not part:
            continue
        
        tokens = part.split()
        if len(tokens) >= 3:
            op, target, source = tokens[0], tokens[1], tokens[2]
            loda_lines.append(f"{op} {target},{source}")
        elif len(tokens) >= 2:
            op, target = tokens[0], tokens[1]
            loda_lines.append(f"{op} {target}")
        else:
            loda_lines.append(part)
    
    return '\n'.join(loda_lines)

Convert text format back to LODA code.

Args

text_code
Text representation of LODA code

Returns

LODA assembly code
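
This inverts loda_to_text_format, so a round trip preserves the program (example derived from the implementation above; model is assumed to be a LodaT5Model instance):

>>> model.text_format_to_loda("mov $0 1 | lpb $0 | sub $0 1 | lpe")
'mov $0,1\nlpb $0\nsub $0,1\nlpe'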

class LodaTokenizer
Expand source code
class LodaTokenizer:
    """Custom tokenizer for LODA assembly language."""
    
    def __init__(self):
        """Initialize LODA tokenizer with vocabulary."""
        # LODA operations
        self.operations = [
            'mov', 'add', 'sub', 'mul', 'div', 'dif', 'mod', 'pow', 'gcd', 'bin',
            'cmp', 'min', 'max', 'lpb', 'lpe', 'nop', 'cal', 'seq', 'trn', 'clr'
        ]
        
        # Common operand patterns
        self.operand_patterns = [
            # Direct memory references
            '$0', '$1', '$2', '$3', '$4', '$5', '$6', '$7', '$8', '$9', '$10',
            # Indirect memory references  
            '$$1', '$$2', '$$3', '$$4', '$$5',
            # Common constants
            '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '-1'
        ]
        
        # Special tokens
        self.special_tokens = ['<pad>', '<unk>', '<s>', '</s>', '<mask>']
        
        # Build vocabulary
        self.vocab = {}
        self.reverse_vocab = {}
        
        # Add special tokens first
        for i, token in enumerate(self.special_tokens):
            self.vocab[token] = i
            self.reverse_vocab[i] = token
        
        # Add operations
        for token in self.operations:
            idx = len(self.vocab)
            self.vocab[token] = idx
            self.reverse_vocab[idx] = token
        
        # Add operand patterns
        for token in self.operand_patterns:
            idx = len(self.vocab)
            self.vocab[token] = idx
            self.reverse_vocab[idx] = token
        
        self.vocab_size = len(self.vocab)
        self.pad_token_id = self.vocab['<pad>']
        self.unk_token_id = self.vocab['<unk>']
        self.bos_token_id = self.vocab['<s>']
        self.eos_token_id = self.vocab['</s>']
    
    def tokenize_loda_code(self, code: str) -> List[str]:
        """
        Tokenize LODA assembly code.
        
        Args:
            code: LODA assembly code as string
            
        Returns:
            List of tokens
        """
        lines = code.strip().split('\n')
        tokens = ['<s>']  # Start token
        
        for line in lines:
            line = line.strip()
            if not line:
                continue
            
            # Split on whitespace and comma
            parts = line.replace(',', ' ').split()
            
            for part in parts:
                part = part.strip()
                if part in self.vocab:
                    tokens.append(part)
                else:
                    # Try to handle unknown operands
                    if part.startswith('$') and part[1:].isdigit():
                        # Direct memory reference
                        if part in self.vocab:
                            tokens.append(part)
                        else:
                            tokens.append('<unk>')
                    elif part.startswith('$$') and part[2:].isdigit():
                        # Indirect memory reference
                        if part in self.vocab:
                            tokens.append(part)
                        else:
                            tokens.append('<unk>')
                    elif part.lstrip('-').isdigit():
                        # Numeric constant
                        if part in self.vocab:
                            tokens.append(part)
                        else:
                            tokens.append('<unk>')
                    else:
                        tokens.append('<unk>')
        
        tokens.append('</s>')  # End token
        return tokens
    
    def encode_loda_code(self, code: str) -> List[int]:
        """
        Encode LODA code to token IDs.
        
        Args:
            code: LODA assembly code
            
        Returns:
            List of token IDs
        """
        tokens = self.tokenize_loda_code(code)
        return [self.vocab.get(token, self.unk_token_id) for token in tokens]
    
    def decode_loda_code(self, token_ids: List[int]) -> str:
        """
        Decode token IDs back to LODA code.
        
        Args:
            token_ids: List of token IDs
            
        Returns:
            LODA assembly code as string
        """
        tokens = [self.reverse_vocab.get(id, '<unk>') for id in token_ids]
        
        # Filter out special tokens
        filtered_tokens = []
        for token in tokens:
            if token in ['<s>', '</s>', '<pad>']:
                continue
            if token == '<unk>':
                continue
            filtered_tokens.append(token)
        
        # Reconstruct LODA code
        code_lines = []
        i = 0
        
        while i < len(filtered_tokens):
            if i + 2 < len(filtered_tokens):
                # Try to form operation: op target source
                op = filtered_tokens[i]
                if op in self.operations and i + 2 < len(filtered_tokens):
                    target = filtered_tokens[i + 1]
                    source = filtered_tokens[i + 2]
                    code_lines.append(f"{op} {target},{source}")
                    i += 3
                elif op in self.operations and i + 1 < len(filtered_tokens):
                    # Single operand operation
                    target = filtered_tokens[i + 1]
                    code_lines.append(f"{op} {target}")
                    i += 2
                else:
                    i += 1
            else:
                i += 1
        
        return '\n'.join(code_lines)

Custom tokenizer for LODA assembly language.

Initialize LODA tokenizer with vocabulary.
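
The vocabulary is fixed and built from the lists in the source above (5 special tokens, 20 operations, 28 operand patterns), so its basic properties follow directly:

>>> tok = LodaTokenizer()
>>> tok.vocab_size
53
>>> (tok.pad_token_id, tok.bos_token_id, tok.eos_token_id)
(0, 2, 3)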

Methods

def decode_loda_code(self, token_ids: List[int]) ‑> str
Expand source code
def decode_loda_code(self, token_ids: List[int]) -> str:
    """
    Decode token IDs back to LODA code.
    
    Args:
        token_ids: List of token IDs
        
    Returns:
        LODA assembly code as string
    """
    tokens = [self.reverse_vocab.get(id, '<unk>') for id in token_ids]
    
    # Filter out special tokens
    filtered_tokens = []
    for token in tokens:
        if token in ['<s>', '</s>', '<pad>']:
            continue
        if token == '<unk>':
            continue
        filtered_tokens.append(token)
    
    # Reconstruct LODA code
    code_lines = []
    i = 0
    
    while i < len(filtered_tokens):
        if i + 2 < len(filtered_tokens):
            # Try to form operation: op target source
            op = filtered_tokens[i]
            if op in self.operations and i + 2 < len(filtered_tokens):
                target = filtered_tokens[i + 1]
                source = filtered_tokens[i + 2]
                code_lines.append(f"{op} {target},{source}")
                i += 3
            elif op in self.operations and i + 1 < len(filtered_tokens):
                # Single operand operation
                target = filtered_tokens[i + 1]
                code_lines.append(f"{op} {target}")
                i += 2
            else:
                i += 1
        else:
            i += 1
    
    return '\n'.join(code_lines)

Decode token IDs back to LODA code.

Args

token_ids
List of token IDs

Returns

LODA assembly code as string
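
For example, decoding the IDs produced for "mov $0,1" (see encode_loda_code below; the exact IDs follow from the vocabulary construction order shown in the class source) reconstructs the instruction with the comma between target and source:

>>> tok = LodaTokenizer()
>>> tok.decode_loda_code([2, 5, 25, 42, 3])
'mov $0,1'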

def encode_loda_code(self, code: str) ‑> List[int]
Expand source code
def encode_loda_code(self, code: str) -> List[int]:
    """
    Encode LODA code to token IDs.
    
    Args:
        code: LODA assembly code
        
    Returns:
        List of token IDs
    """
    tokens = self.tokenize_loda_code(code)
    return [self.vocab.get(token, self.unk_token_id) for token in tokens]

Encode LODA code to token IDs.

Args

code
LODA assembly code

Returns

List of token IDs
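
With the vocabulary built in __init__ (special tokens at IDs 0-4, operations starting at 5, operands starting at 25), a single instruction encodes as:

>>> tok = LodaTokenizer()
>>> tok.encode_loda_code("mov $0,1")
[2, 5, 25, 42, 3]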

def tokenize_loda_code(self, code: str) ‑> List[str]
Expand source code
def tokenize_loda_code(self, code: str) -> List[str]:
    """
    Tokenize LODA assembly code.
    
    Args:
        code: LODA assembly code as string
        
    Returns:
        List of tokens
    """
    lines = code.strip().split('\n')
    tokens = ['<s>']  # Start token
    
    for line in lines:
        line = line.strip()
        if not line:
            continue
        
        # Split on whitespace and comma
        parts = line.replace(',', ' ').split()
        
        for part in parts:
            part = part.strip()
            if part in self.vocab:
                tokens.append(part)
            else:
                # Try to handle unknown operands
                if part.startswith('$') and part[1:].isdigit():
                    # Direct memory reference
                    if part in self.vocab:
                        tokens.append(part)
                    else:
                        tokens.append('<unk>')
                elif part.startswith('$$') and part[2:].isdigit():
                    # Indirect memory reference
                    if part in self.vocab:
                        tokens.append(part)
                    else:
                        tokens.append('<unk>')
                elif part.lstrip('-').isdigit():
                    # Numeric constant
                    if part in self.vocab:
                        tokens.append(part)
                    else:
                        tokens.append('<unk>')
                else:
                    tokens.append('<unk>')
    
    tokens.append('</s>')  # End token
    return tokens

Tokenize LODA assembly code.

Args

code
LODA assembly code as string

Returns

List of tokens
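
For example, tokens outside the fixed vocabulary (such as memory cells above $10) become '<unk>':

>>> tok = LodaTokenizer()
>>> tok.tokenize_loda_code("mov $0,1\nadd $0,$2")
['<s>', 'mov', '$0', '1', 'add', '$0', '$2', '</s>']
>>> tok.tokenize_loda_code("mov $42,7")
['<s>', 'mov', '<unk>', '7', '</s>']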

class LodaTrainer (model: LodaT5Model,
train_dataset: LodaDataset,
val_dataset: LodaDataset | None = None,
learning_rate: float = 5e-05,
batch_size: int = 8,
num_epochs: int = 3,
warmup_steps: int = 500,
save_dir: str = 'loda_llm_model')
Expand source code
class LodaTrainer:
    """Trainer class for LODA LLM."""
    
    def __init__(self, 
                 model: LodaT5Model,
                 train_dataset: LodaDataset,
                 val_dataset: Optional[LodaDataset] = None,
                 learning_rate: float = 5e-5,
                 batch_size: int = 8,
                 num_epochs: int = 3,
                 warmup_steps: int = 500,
                 save_dir: str = "loda_llm_model"):
        """
        Initialize the trainer.
        
        Args:
            model: LodaT5Model to train
            train_dataset: Training dataset
            val_dataset: Validation dataset (optional)
            learning_rate: Learning rate
            batch_size: Batch size
            num_epochs: Number of training epochs
            warmup_steps: Number of warmup steps for learning rate schedule
            save_dir: Directory to save the model
        """
        self.model = model
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.warmup_steps = warmup_steps
        self.save_dir = save_dir
        
        # Set up device
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.model.to(self.device)
        
        # Set up data loaders
        self.train_loader = DataLoader(
            train_dataset, 
            batch_size=batch_size, 
            shuffle=True,
            collate_fn=self._collate_fn
        )
        
        if val_dataset:
            self.val_loader = DataLoader(
                val_dataset, 
                batch_size=batch_size, 
                shuffle=False,
                collate_fn=self._collate_fn
            )
        
        # Set up optimizer
        self.optimizer = AdamW(
            self.model.model.parameters(),
            lr=learning_rate,
            weight_decay=0.01
        )
        
        # Set up learning rate scheduler
        total_steps = len(self.train_loader) * num_epochs
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=total_steps
        )
    
    def _collate_fn(self, batch):
        """Collate function for DataLoader."""
        # Pad sequences to the same length
        input_ids = [item['input_ids'] for item in batch]
        attention_masks = [item['attention_mask'] for item in batch]
        labels = [item['labels'] for item in batch]
        decoder_attention_masks = [item['decoder_attention_mask'] for item in batch]
        
        # Pad input sequences
        max_input_len = max(len(seq) for seq in input_ids)
        padded_input_ids = []
        padded_attention_masks = []
        
        for i in range(len(input_ids)):
            seq_len = len(input_ids[i])
            pad_len = max_input_len - seq_len
            
            padded_input_ids.append(
                torch.cat([input_ids[i], torch.zeros(pad_len, dtype=torch.long)])
            )
            padded_attention_masks.append(
                torch.cat([attention_masks[i], torch.zeros(pad_len, dtype=torch.long)])
            )
        
        # Pad target sequences
        max_target_len = max(len(seq) for seq in labels)
        padded_labels = []
        padded_decoder_masks = []
        
        for i in range(len(labels)):
            seq_len = len(labels[i])
            pad_len = max_target_len - seq_len
            
            # For labels, use -100 for padding (ignored in loss calculation)
            padded_labels.append(
                torch.cat([labels[i], torch.full((pad_len,), -100, dtype=torch.long)])
            )
            padded_decoder_masks.append(
                torch.cat([decoder_attention_masks[i], torch.zeros(pad_len, dtype=torch.long)])
            )
        
        return {
            'input_ids': torch.stack(padded_input_ids),
            'attention_mask': torch.stack(padded_attention_masks),
            'labels': torch.stack(padded_labels),
            'decoder_attention_mask': torch.stack(padded_decoder_masks)
        }
    
    def train_epoch(self):
        """Train for one epoch."""
        self.model.model.train()
        total_loss = 0
        
        progress_bar = tqdm(self.train_loader, desc="Training")
        
        for batch in progress_bar:
            # Move to device
            batch = {k: v.to(self.device) for k, v in batch.items()}
            
            # Forward pass
            outputs = self.model.forward(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                labels=batch['labels']
            )
            
            loss = outputs.loss
            total_loss += loss.item()
            
            # Backward pass
            loss.backward()
            
            # Clip gradients
            torch.nn.utils.clip_grad_norm_(self.model.model.parameters(), 1.0)
            
            # Update parameters
            self.optimizer.step()
            self.scheduler.step()
            self.optimizer.zero_grad()
            
            # Update progress bar
            progress_bar.set_postfix({'loss': loss.item()})
        
        return total_loss / len(self.train_loader)
    
    def validate(self):
        """Validate the model."""
        if not self.val_dataset:
            return None
        
        self.model.model.eval()
        total_loss = 0
        
        with torch.no_grad():
            progress_bar = tqdm(self.val_loader, desc="Validation")
            
            for batch in progress_bar:
                # Move to device
                batch = {k: v.to(self.device) for k, v in batch.items()}
                
                # Forward pass
                outputs = self.model.forward(
                    input_ids=batch['input_ids'],
                    attention_mask=batch['attention_mask'],
                    labels=batch['labels']
                )
                
                loss = outputs.loss
                total_loss += loss.item()
                
                progress_bar.set_postfix({'val_loss': loss.item()})
        
        return total_loss / len(self.val_loader)
    
    def train(self):
        """Train the model."""
        print(f"Training on device: {self.device}")
        print(f"Training examples: {len(self.train_dataset)}")
        if self.val_dataset:
            print(f"Validation examples: {len(self.val_dataset)}")
        
        best_val_loss = float('inf')
        
        for epoch in range(self.num_epochs):
            print(f"\nEpoch {epoch + 1}/{self.num_epochs}")
            
            # Train
            train_loss = self.train_epoch()
            print(f"Training loss: {train_loss:.4f}")
            
            # Validate
            val_loss = self.validate()
            if val_loss is not None:
                print(f"Validation loss: {val_loss:.4f}")
                
                # Save best model
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    self.save_model(f"{self.save_dir}_best")
                    print("Saved best model")
            
            # Save checkpoint
            self.save_model(f"{self.save_dir}_epoch_{epoch + 1}")
        
        print("\nTraining completed!")
        return self.model
    
    def save_model(self, path: str):
        """Save the model."""
        self.model.save_model(path)

Trainer class for LODA LLM.

Initialize the trainer.

Args

model
LodaT5Model to train
train_dataset
Training dataset
val_dataset
Validation dataset (optional)
learning_rate
Learning rate
batch_size
Batch size
num_epochs
Number of training epochs
warmup_steps
Number of warmup steps for learning rate schedule
save_dir
Directory to save the model
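
A minimal training sketch (assuming model, train_dataset and val_dataset have already been built with LodaT5Model and LodaDataset, as in train_loda_llm; hyperparameters and paths are illustrative). Checkpoints are written as "<save_dir>_epoch_N" after every epoch and "<save_dir>_best" whenever the validation loss improves:

>>> trainer = LodaTrainer(model, train_dataset, val_dataset,
...                       learning_rate=5e-5, batch_size=8, num_epochs=3,
...                       save_dir="loda_llm_model")
>>> trained_model = trainer.train()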

Methods

def save_model(self, path: str)
Expand source code
def save_model(self, path: str):
    """Save the model."""
    self.model.save_model(path)

Save the model to the given path (delegates to LodaT5Model.save_model()).

def train(self)
Expand source code
def train(self):
    """Train the model."""
    print(f"Training on device: {self.device}")
    print(f"Training examples: {len(self.train_dataset)}")
    if self.val_dataset:
        print(f"Validation examples: {len(self.val_dataset)}")
    
    best_val_loss = float('inf')
    
    for epoch in range(self.num_epochs):
        print(f"\nEpoch {epoch + 1}/{self.num_epochs}")
        
        # Train
        train_loss = self.train_epoch()
        print(f"Training loss: {train_loss:.4f}")
        
        # Validate
        val_loss = self.validate()
        if val_loss is not None:
            print(f"Validation loss: {val_loss:.4f}")
            
            # Save best model
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                self.save_model(f"{self.save_dir}_best")
                print("Saved best model")
        
        # Save checkpoint
        self.save_model(f"{self.save_dir}_epoch_{epoch + 1}")
    
    print("\nTraining completed!")
    return self.model

Run the full training loop: train and validate each epoch, write a checkpoint after every epoch, save the best model whenever the validation loss improves, and return the trained model.

def train_epoch(self)
Expand source code
def train_epoch(self):
    """Train for one epoch."""
    self.model.model.train()
    total_loss = 0
    
    progress_bar = tqdm(self.train_loader, desc="Training")
    
    for batch in progress_bar:
        # Move to device
        batch = {k: v.to(self.device) for k, v in batch.items()}
        
        # Forward pass
        outputs = self.model.forward(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            labels=batch['labels']
        )
        
        loss = outputs.loss
        total_loss += loss.item()
        
        # Backward pass
        loss.backward()
        
        # Clip gradients
        torch.nn.utils.clip_grad_norm_(self.model.model.parameters(), 1.0)
        
        # Update parameters
        self.optimizer.step()
        self.scheduler.step()
        self.optimizer.zero_grad()
        
        # Update progress bar
        progress_bar.set_postfix({'loss': loss.item()})
    
    return total_loss / len(self.train_loader)

Train for one epoch and return the average training loss.

def validate(self)
Expand source code
def validate(self):
    """Validate the model."""
    if not self.val_dataset:
        return None
    
    self.model.model.eval()
    total_loss = 0
    
    with torch.no_grad():
        progress_bar = tqdm(self.val_loader, desc="Validation")
        
        for batch in progress_bar:
            # Move to device
            batch = {k: v.to(self.device) for k, v in batch.items()}
            
            # Forward pass
            outputs = self.model.forward(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                labels=batch['labels']
            )
            
            loss = outputs.loss
            total_loss += loss.item()
            
            progress_bar.set_postfix({'val_loss': loss.item()})
    
    return total_loss / len(self.val_loader)

Evaluate on the validation set and return the average validation loss, or None if no validation dataset was provided.

class TrainingExample (sequence_id: str,
description: str,
loda_code: str,
terms: List[int] | None = None)
Expand source code
@dataclass
class TrainingExample:
    """A single training example pairing natural language with LODA code."""
    sequence_id: str
    description: str
    loda_code: str
    terms: Optional[List[int]] = None

A single training example pairing natural language with LODA code.

Instance variables

var description : str
var loda_code : str
var sequence_id : str
var terms : List[int] | None
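
A minimal construction example (values are illustrative; A000012 is the all-ones OEIS sequence, and "mov $0,1" is a trivial program that always outputs 1):

>>> ex = TrainingExample(sequence_id="A000012",
...                      description="The all 1's sequence",
...                      loda_code="mov $0,1",
...                      terms=[1, 1, 1, 1, 1])
>>> ex.description
"The all 1's sequence"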