Module loda.llm

Large Language Model (LLM) implementation for natural language to LODA code generation.

This module provides functionality to train transformer-based models that can understand natural language descriptions of integer sequences (like OEIS sequences) and generate corresponding LODA assembly programs.

Key components:

- Data preprocessing for OEIS sequence descriptions and LODA programs
- Transformer-based encoder-decoder architecture
- Training pipeline with proper tokenization
- Inference utilities for code generation
- Evaluation metrics for generated programs

Example usage:

>>> from loda.llm import LodaT5Model, LodaGenerator, train_loda_llm
>>> 
>>> # Train a model
>>> model = train_loda_llm("programs/oeis", "trained_model")
>>> 
>>> # Generate code
>>> generator = LodaGenerator(model)
>>> results = generator.generate("Fibonacci numbers")
>>> print(results[0].generated_code)

Sub-modules

loda.llm.data_preprocessing

Data preprocessing utilities for LLM training on OEIS sequences and LODA programs …

loda.llm.inference

Inference and evaluation utilities for the LODA LLM …

loda.llm.model

Transformer-based model for natural language to LODA code generation …

loda.llm.trainer

Training script for the LODA LLM (Large Language Model) …

Functions

def create_dataset(programs_dir: str, output_file: str, max_examples: int = -1, augment: bool = True)
def create_dataset(programs_dir: str, output_file: str, max_examples: int = -1, augment: bool = True):
    """
    Convenience function to create and save a training dataset.
    
    Args:
        programs_dir: Path to OEIS programs directory
        output_file: Path to save the dataset
        max_examples: Maximum number of examples (-1 for all)
        augment: Whether to augment with description variations
    """
    preprocessor = DataPreprocessor(programs_dir)
    examples = preprocessor.create_training_examples(max_examples)
    
    if augment:
        examples = preprocessor.augment_descriptions(examples)
    
    preprocessor.save_dataset(examples, output_file)
    return examples

Convenience function to create and save a training dataset.

Args

programs_dir
Path to OEIS programs directory
output_file
Path to save the dataset
max_examples
Maximum number of examples (-1 for all)
augment
Whether to augment with description variations
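
A minimal usage sketch; the programs directory, output file name and example cap below are illustrative:

>>> from loda.llm import create_dataset
>>>
>>> # Build an augmented dataset from the first 1000 programs and save it as JSON.
>>> examples = create_dataset("programs/oeis", "loda_dataset.json", max_examples=1000)
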
def train_loda_llm(programs_dir: str,
output_dir: str = 'loda_llm_model',
model_name: str = 't5-small',
max_examples: int = -1,
val_split: float = 0.1,
batch_size: int = 8,
learning_rate: float = 5e-05,
num_epochs: int = 3)
def train_loda_llm(programs_dir: str,
                   output_dir: str = "loda_llm_model",
                   model_name: str = "t5-small",
                   max_examples: int = -1,
                   val_split: float = 0.1,
                   batch_size: int = 8,
                   learning_rate: float = 5e-5,
                   num_epochs: int = 3):
    """
    Main training function.
    
    Args:
        programs_dir: Directory containing OEIS programs
        output_dir: Directory to save the trained model
        model_name: Base T5 model to use
        max_examples: Maximum number of training examples (-1 for all)
        val_split: Fraction of data to use for validation
        batch_size: Training batch size
        learning_rate: Learning rate
        num_epochs: Number of training epochs
    """
    print("Preparing training data...")
    
    # Create training examples
    preprocessor = DataPreprocessor(programs_dir)
    examples = preprocessor.create_training_examples(max_examples)
    
    if len(examples) == 0:
        print("No training examples found!")
        return None
    
    # Augment examples
    print("Augmenting training examples...")
    examples = preprocessor.augment_descriptions(examples)
    
    # Split into train/validation
    if val_split > 0:
        split_idx = int(len(examples) * (1 - val_split))
        train_examples = examples[:split_idx]
        val_examples = examples[split_idx:]
    else:
        train_examples = examples
        val_examples = None
    
    print(f"Training examples: {len(train_examples)}")
    if val_examples:
        print(f"Validation examples: {len(val_examples)}")
    
    # Create model
    print(f"Creating model based on {model_name}...")
    model = LodaT5Model(model_name)
    
    # Create datasets
    train_dataset = LodaDataset(train_examples, model)
    val_dataset = LodaDataset(val_examples, model) if val_examples else None
    
    # Create trainer
    trainer = LodaTrainer(
        model=model,
        train_dataset=train_dataset,
        val_dataset=val_dataset,
        learning_rate=learning_rate,
        batch_size=batch_size,
        num_epochs=num_epochs,
        save_dir=output_dir
    )
    
    # Train the model
    trained_model = trainer.train()
    
    # Save final model
    trained_model.save_model(output_dir)
    print(f"Final model saved to {output_dir}")
    
    return trained_model

Main training function.

Args

programs_dir
Directory containing OEIS programs
output_dir
Directory to save the trained model
model_name
Base T5 model to use
max_examples
Maximum number of training examples (-1 for all)
val_split
Fraction of data to use for validation
batch_size
Training batch size
learning_rate
Learning rate
num_epochs
Number of training epochs
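
For a quick experiment, the defaults can be overridden explicitly; the subset size and hyperparameters below are illustrative, not tuned recommendations:

>>> from loda.llm import train_loda_llm
>>>
>>> # Train on a small subset for a single epoch; returns the trained LodaT5Model.
>>> model = train_loda_llm("programs/oeis",
...                        output_dir="loda_llm_model",
...                        max_examples=500,
...                        batch_size=4,
...                        num_epochs=1)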

Classes

class DataPreprocessor (programs_dir: str)
class DataPreprocessor:
    """Handles preprocessing of OEIS programs for LLM training."""
    
    def __init__(self, programs_dir: str):
        """Initialize with path to OEIS programs directory."""
        self.programs_dir = programs_dir
        self.program_cache = ProgramCache(programs_dir)
        
    def extract_description_from_program(self, program_text: str) -> Optional[str]:
        """
        Extract the natural language description from a LODA program.
        
        LODA programs typically start with comments like:
        ; A000045: Fibonacci numbers: F(n) = F(n-1) + F(n-2) with F(0) = 0 and F(1) = 1.
        
        Args:
            program_text: The full LODA program as text
            
        Returns:
            The description string or None if no description found
        """
        lines = program_text.strip().split('\n')
        
        for line in lines:
            # Look for OEIS description lines (start with ; A######:)
            match = re.match(r';\s*A\d{6}:\s*(.+)', line)
            if match:
                description = match.group(1).strip()
                # Clean up common artifacts
                description = description.rstrip('.')
                # Remove mathematical notation that might be confusing
                # Keep it simple for initial training
                return description
                
        return None
    
    def extract_terms_from_program(self, program_text: str) -> Optional[List[int]]:
        """
        Extract the sequence terms from a LODA program comment.
        
        Args:
            program_text: The full LODA program as text
            
        Returns:
            List of sequence terms or None if not found
        """
        lines = program_text.strip().split('\n')
        
        for line in lines:
            # Look for lines with comma-separated numbers (sequence terms)
            if line.startswith(';') and ',' in line:
                # Extract numbers from the line
                numbers_str = line[1:].strip()  # Remove the ';'
                # Skip if it looks like it contains non-numeric content
                if ':' in numbers_str or any(c.isalpha() for c in numbers_str):
                    continue
                    
                try:
                    terms = [int(x.strip()) for x in numbers_str.split(',') if x.strip()]
                    if len(terms) >= 5:  # Reasonable number of terms
                        return terms
                except ValueError:
                    continue
                    
        return None
    
    def clean_loda_code(self, program_text: str) -> str:
        """
        Clean LODA code by removing comments and normalizing format.
        
        Args:
            program_text: Raw LODA program text
            
        Returns:
            Cleaned LODA code suitable for training
        """
        lines = program_text.strip().split('\n')
        code_lines = []
        
        for line in lines:
            # Skip comment lines (lines that start with ;)
            if line.strip().startswith(';'):
                continue
            # Skip empty lines
            if not line.strip():
                continue
            
            # Remove inline comments (everything after ; on the same line)
            if ';' in line:
                code_part = line.split(';')[0].strip()
            else:
                code_part = line.strip()
            
            # Only add non-empty code lines
            if code_part:
                code_lines.append(code_part)
        
        return '\n'.join(code_lines)
    
    def create_training_examples(self, max_examples: int = -1) -> List[TrainingExample]:
        """
        Create training examples from all available LODA programs.
        
        Args:
            max_examples: Maximum number of examples to create (-1 for all)
            
        Returns:
            List of TrainingExample objects
        """
        examples = []
        program_ids = self.program_cache.all_ids()
        
        if max_examples > 0:
            program_ids = program_ids[:max_examples]
        
        print(f"Processing {len(program_ids)} programs...")
        
        for i, program_id in enumerate(program_ids):
            if i % 1000 == 0:
                print(f"Processed {i}/{len(program_ids)} programs")
                
            try:
                # Read the program file
                program_path = self.program_cache.path(program_id)
                if not os.path.exists(program_path):
                    continue
                    
                with open(program_path, 'r') as f:
                    program_text = f.read()
                
                # Extract description
                description = self.extract_description_from_program(program_text)
                if not description:
                    continue
                
                # Extract terms (optional)
                terms = self.extract_terms_from_program(program_text)
                
                # Clean the LODA code
                clean_code = self.clean_loda_code(program_text)
                if not clean_code:
                    continue
                
                # Validate that the code parses correctly
                try:
                    Program(clean_code)
                except Exception:
                    continue  # Skip programs that don't parse
                
                example = TrainingExample(
                    sequence_id=str(program_id),
                    description=description,
                    loda_code=clean_code,
                    terms=terms
                )
                examples.append(example)
                
            except Exception as e:
                print(f"Error processing {program_id}: {e}")
                continue
        
        print(f"Created {len(examples)} training examples")
        return examples
    
    def augment_descriptions(self, examples: List[TrainingExample]) -> List[TrainingExample]:
        """
        Augment training examples with variations of descriptions.
        
        This can help make the model more robust to different phrasings.
        
        Args:
            examples: List of original training examples
            
        Returns:
            Augmented list with additional variations
        """
        augmented = list(examples)  # Start with originals
        
        for example in examples:
            desc = example.description
            
            # Create variations
            variations = []
            
            # Add "sequence of" prefix if not present
            if not desc.lower().startswith(('sequence', 'the sequence')):
                variations.append(f"Sequence of {desc.lower()}")
            
            # Add "Generate" prefix
            variations.append(f"Generate {desc.lower()}")
            
            # Add "Compute" prefix
            variations.append(f"Compute {desc.lower()}")
            
            # Remove mathematical symbols for simpler versions
            simple_desc = re.sub(r'[()=+\-*/^]', ' ', desc)
            simple_desc = re.sub(r'\s+', ' ', simple_desc).strip()
            if simple_desc != desc and simple_desc:
                variations.append(simple_desc)
            
            # Create new examples for each variation
            for variation in variations:
                augmented_example = TrainingExample(
                    sequence_id=str(example.sequence_id) + "_aug",
                    description=variation,
                    loda_code=example.loda_code,
                    terms=example.terms
                )
                augmented.append(augmented_example)
        
        return augmented
    
    def save_dataset(self, examples: List[TrainingExample], output_file: str):
        """
        Save training examples to a file for later use.
        
        Args:
            examples: List of training examples
            output_file: Path to output file
        """
        import json
        
        data = []
        for example in examples:
            data.append({
                'sequence_id': example.sequence_id,
                'description': example.description,
                'loda_code': example.loda_code,
                'terms': example.terms
            })
        
        with open(output_file, 'w') as f:
            json.dump(data, f, indent=2)
        
        print(f"Saved {len(examples)} examples to {output_file}")
    
    def load_dataset(self, input_file: str) -> List[TrainingExample]:
        """
        Load training examples from a file.
        
        Args:
            input_file: Path to input file
            
        Returns:
            List of TrainingExample objects
        """
        import json
        
        with open(input_file, 'r') as f:
            data = json.load(f)
        
        examples = []
        for item in data:
            example = TrainingExample(
                sequence_id=item['sequence_id'],
                description=item['description'],
                loda_code=item['loda_code'],
                terms=item.get('terms')
            )
            examples.append(example)
        
        print(f"Loaded {len(examples)} examples from {input_file}")
        return examples

Handles preprocessing of OEIS programs for LLM training.

Initialize with path to OEIS programs directory.
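
A typical preprocessing flow (the paths and example cap are illustrative):

>>> from loda.llm import DataPreprocessor
>>>
>>> preprocessor = DataPreprocessor("programs/oeis")
>>> examples = preprocessor.create_training_examples(max_examples=100)
>>> examples = preprocessor.augment_descriptions(examples)
>>> preprocessor.save_dataset(examples, "loda_dataset.json")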

Methods

def augment_descriptions(self,
examples: List[TrainingExample]) ‑> List[TrainingExample]
def augment_descriptions(self, examples: List[TrainingExample]) -> List[TrainingExample]:
    """
    Augment training examples with variations of descriptions.
    
    This can help make the model more robust to different phrasings.
    
    Args:
        examples: List of original training examples
        
    Returns:
        Augmented list with additional variations
    """
    augmented = list(examples)  # Start with originals
    
    for example in examples:
        desc = example.description
        
        # Create variations
        variations = []
        
        # Add "sequence of" prefix if not present
        if not desc.lower().startswith(('sequence', 'the sequence')):
            variations.append(f"Sequence of {desc.lower()}")
        
        # Add "Generate" prefix
        variations.append(f"Generate {desc.lower()}")
        
        # Add "Compute" prefix
        variations.append(f"Compute {desc.lower()}")
        
        # Remove mathematical symbols for simpler versions
        simple_desc = re.sub(r'[()=+\-*/^]', ' ', desc)
        simple_desc = re.sub(r'\s+', ' ', simple_desc).strip()
        if simple_desc != desc and simple_desc:
            variations.append(simple_desc)
        
        # Create new examples for each variation
        for variation in variations:
            augmented_example = TrainingExample(
                sequence_id=str(example.sequence_id) + "_aug",
                description=variation,
                loda_code=example.loda_code,
                terms=example.terms
            )
            augmented.append(augmented_example)
    
    return augmented

Augment training examples with variations of descriptions.

This can help make the model more robust to different phrasings.

Args

examples
List of original training examples

Returns

Augmented list with additional variations
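
A small illustration of the variations produced, assuming TrainingExample can be imported from loda.llm.data_preprocessing (where it is used) and reusing the preprocessor from the class example above:

>>> from loda.llm.data_preprocessing import TrainingExample
>>>
>>> example = TrainingExample(sequence_id="A000045",
...                           description="Fibonacci numbers",
...                           loda_code="mov $1,$0",
...                           terms=[0, 1, 1, 2, 3])
>>> augmented = preprocessor.augment_descriptions([example])
>>> [a.description for a in augmented]
['Fibonacci numbers', 'Sequence of fibonacci numbers', 'Generate fibonacci numbers', 'Compute fibonacci numbers']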

def clean_loda_code(self, program_text: str) ‑> str
def clean_loda_code(self, program_text: str) -> str:
    """
    Clean LODA code by removing comments and normalizing format.
    
    Args:
        program_text: Raw LODA program text
        
    Returns:
        Cleaned LODA code suitable for training
    """
    lines = program_text.strip().split('\n')
    code_lines = []
    
    for line in lines:
        # Skip comment lines (lines that start with ;)
        if line.strip().startswith(';'):
            continue
        # Skip empty lines
        if not line.strip():
            continue
        
        # Remove inline comments (everything after ; on the same line)
        if ';' in line:
            code_part = line.split(';')[0].strip()
        else:
            code_part = line.strip()
        
        # Only add non-empty code lines
        if code_part:
            code_lines.append(code_part)
    
    return '\n'.join(code_lines)

Clean LODA code by removing comments and normalizing format.

Args

program_text
Raw LODA program text

Returns

Cleaned LODA code suitable for training
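
For example, header and inline comments are stripped while the instructions themselves are kept:

>>> raw = "; A000045: Fibonacci numbers.\n; 0,1,1,2,3,5\nmov $3,1\nadd $1,$3 ; inline comment\n"
>>> print(preprocessor.clean_loda_code(raw))
mov $3,1
add $1,$3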

def create_training_examples(self, max_examples: int = -1) ‑> List[TrainingExample]
def create_training_examples(self, max_examples: int = -1) -> List[TrainingExample]:
    """
    Create training examples from all available LODA programs.
    
    Args:
        max_examples: Maximum number of examples to create (-1 for all)
        
    Returns:
        List of TrainingExample objects
    """
    examples = []
    program_ids = self.program_cache.all_ids()
    
    if max_examples > 0:
        program_ids = program_ids[:max_examples]
    
    print(f"Processing {len(program_ids)} programs...")
    
    for i, program_id in enumerate(program_ids):
        if i % 1000 == 0:
            print(f"Processed {i}/{len(program_ids)} programs")
            
        try:
            # Read the program file
            program_path = self.program_cache.path(program_id)
            if not os.path.exists(program_path):
                continue
                
            with open(program_path, 'r') as f:
                program_text = f.read()
            
            # Extract description
            description = self.extract_description_from_program(program_text)
            if not description:
                continue
            
            # Extract terms (optional)
            terms = self.extract_terms_from_program(program_text)
            
            # Clean the LODA code
            clean_code = self.clean_loda_code(program_text)
            if not clean_code:
                continue
            
            # Validate that the code parses correctly
            try:
                Program(clean_code)
            except Exception:
                continue  # Skip programs that don't parse
            
            example = TrainingExample(
                sequence_id=str(program_id),
                description=description,
                loda_code=clean_code,
                terms=terms
            )
            examples.append(example)
            
        except Exception as e:
            print(f"Error processing {program_id}: {e}")
            continue
    
    print(f"Created {len(examples)} training examples")
    return examples

Create training examples from all available LODA programs.

Args

max_examples
Maximum number of examples to create (-1 for all)

Returns

List of TrainingExample objects

def extract_description_from_program(self, program_text: str) ‑> str | None
def extract_description_from_program(self, program_text: str) -> Optional[str]:
    """
    Extract the natural language description from a LODA program.
    
    LODA programs typically start with comments like:
    ; A000045: Fibonacci numbers: F(n) = F(n-1) + F(n-2) with F(0) = 0 and F(1) = 1.
    
    Args:
        program_text: The full LODA program as text
        
    Returns:
        The description string or None if no description found
    """
    lines = program_text.strip().split('\n')
    
    for line in lines:
        # Look for OEIS description lines (start with ; A######:)
        match = re.match(r';\s*A\d{6}:\s*(.+)', line)
        if match:
            description = match.group(1).strip()
            # Clean up common artifacts
            description = description.rstrip('.')
            # Remove mathematical notation that might be confusing
            # Keep it simple for initial training
            return description
            
    return None

Extract the natural language description from a LODA program.

LODA programs typically start with comments like: ; A000045: Fibonacci numbers: F(n) = F(n-1) + F(n-2) with F(0) = 0 and F(1) = 1.

Args

program_text
The full LODA program as text

Returns

The description string or None if no description found
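
For example, given a program with the usual header comment, the A-number prefix and trailing period are removed:

>>> text = "; A000045: Fibonacci numbers: F(n) = F(n-1) + F(n-2).\n; 0,1,1,2,3,5,8\nmov $1,$0\n"
>>> preprocessor.extract_description_from_program(text)
'Fibonacci numbers: F(n) = F(n-1) + F(n-2)'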

def extract_terms_from_program(self, program_text: str) ‑> List[int] | None
def extract_terms_from_program(self, program_text: str) -> Optional[List[int]]:
    """
    Extract the sequence terms from a LODA program comment.
    
    Args:
        program_text: The full LODA program as text
        
    Returns:
        List of sequence terms or None if not found
    """
    lines = program_text.strip().split('\n')
    
    for line in lines:
        # Look for lines with comma-separated numbers (sequence terms)
        if line.startswith(';') and ',' in line:
            # Extract numbers from the line
            numbers_str = line[1:].strip()  # Remove the ';'
            # Skip if it looks like it contains non-numeric content
            if ':' in numbers_str or any(c.isalpha() for c in numbers_str):
                continue
                
            try:
                terms = [int(x.strip()) for x in numbers_str.split(',') if x.strip()]
                if len(terms) >= 5:  # Reasonable number of terms
                    return terms
            except ValueError:
                continue
                
    return None

Extract the sequence terms from a LODA program comment.

Args

program_text
The full LODA program as text

Returns

List of sequence terms or None if not found
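
For example, the terms are read from the first purely numeric comment line:

>>> text = "; A000045: Fibonacci numbers.\n; 0,1,1,2,3,5,8,13\nmov $1,$0\n"
>>> preprocessor.extract_terms_from_program(text)
[0, 1, 1, 2, 3, 5, 8, 13]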

def load_dataset(self, input_file: str) ‑> List[TrainingExample]
def load_dataset(self, input_file: str) -> List[TrainingExample]:
    """
    Load training examples from a file.
    
    Args:
        input_file: Path to input file
        
    Returns:
        List of TrainingExample objects
    """
    import json
    
    with open(input_file, 'r') as f:
        data = json.load(f)
    
    examples = []
    for item in data:
        example = TrainingExample(
            sequence_id=item['sequence_id'],
            description=item['description'],
            loda_code=item['loda_code'],
            terms=item.get('terms')
        )
        examples.append(example)
    
    print(f"Loaded {len(examples)} examples from {input_file}")
    return examples

Load training examples from a file.

Args

input_file
Path to input file

Returns

List of TrainingExample objects

def save_dataset(self,
examples: List[TrainingExample],
output_file: str)
def save_dataset(self, examples: List[TrainingExample], output_file: str):
    """
    Save training examples to a file for later use.
    
    Args:
        examples: List of training examples
        output_file: Path to output file
    """
    import json
    
    data = []
    for example in examples:
        data.append({
            'sequence_id': example.sequence_id,
            'description': example.description,
            'loda_code': example.loda_code,
            'terms': example.terms
        })
    
    with open(output_file, 'w') as f:
        json.dump(data, f, indent=2)
    
    print(f"Saved {len(examples)} examples to {output_file}")

Save training examples to a file for later use.

Args

examples
List of training examples
output_file
Path to output file
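
Datasets round-trip through JSON, so preprocessing can be done once and reused later (the file name is illustrative):

>>> preprocessor.save_dataset(examples, "loda_dataset.json")
>>> reloaded = preprocessor.load_dataset("loda_dataset.json")
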
class GenerationResult (description: str,
generated_code: str,
is_valid: bool,
error_message: str | None = None,
generated_sequence: List[int] | None = None,
generation_time: float = 0.0)
@dataclass
class GenerationResult:
    """Result of code generation."""
    description: str
    generated_code: str
    is_valid: bool
    error_message: Optional[str] = None
    generated_sequence: Optional[List[int]] = None
    generation_time: float = 0.0

Result of code generation.

Instance variables

var description : str
var error_message : str | None
var generated_code : str
var generated_sequence : List[int] | None
var generation_time : float
var is_valid : bool
class LodaEvaluator (model: LodaT5Model)
class LodaEvaluator:
    """Evaluator for assessing model performance."""
    
    def __init__(self, model: LodaT5Model):
        """
        Initialize the evaluator.
        
        Args:
            model: Trained LodaT5Model to evaluate
        """
        self.model = model
        self.generator = LodaGenerator(model)
    
    def evaluate_examples(self, test_examples: List[TrainingExample]) -> Tuple[Dict[str, float], List[GenerationResult]]:
        """
        Evaluate the model on test examples.
        
        Args:
            test_examples: List of test examples
            
        Returns:
            Tuple of (metrics dictionary, list of GenerationResult objects)
        """
        print(f"Evaluating on {len(test_examples)} examples...")
        
        total_examples = len(test_examples)
        valid_programs = 0
        exact_matches = 0
        sequence_matches = 0
        total_generation_time = 0
        
        results = []
        
        for i, example in enumerate(test_examples):
            if i % 10 == 0:
                print(f"Progress: {i}/{total_examples}")
            
            # Generate code
            generation_results = self.generator.generate(example.description, num_samples=1)
            
            if generation_results:
                result = generation_results[0]
                results.append(result)
                
                total_generation_time += result.generation_time
                
                if result.is_valid:
                    valid_programs += 1
                    
                    # Check for exact match
                    if self._normalize_code(result.generated_code) == self._normalize_code(example.loda_code):
                        exact_matches += 1
                    
                    # Check for sequence match (if we have expected terms)
                    if (example.terms and result.generated_sequence and 
                        len(result.generated_sequence) >= 3 and
                        result.generated_sequence[:3] == example.terms[:3]):
                        sequence_matches += 1
        
        # Calculate metrics
        metrics = {
            'total_examples': total_examples,
            'valid_program_rate': valid_programs / total_examples if total_examples > 0 else 0,
            'exact_match_rate': exact_matches / total_examples if total_examples > 0 else 0,
            'sequence_match_rate': sequence_matches / total_examples if total_examples > 0 else 0,
            'avg_generation_time': total_generation_time / total_examples if total_examples > 0 else 0,
            'valid_programs': valid_programs,
            'exact_matches': exact_matches,
            'sequence_matches': sequence_matches
        }
        
        return metrics, results
    
    def _normalize_code(self, code: str) -> str:
        """Normalize code for comparison."""
        # Remove extra whitespace and normalize format
        lines = []
        for line in code.strip().split('\n'):
            line = line.strip()
            if line:
                lines.append(line)
        return '\n'.join(lines)
    
    def print_evaluation_report(self, metrics: Dict[str, float], results: List[GenerationResult]):
        """Print a detailed evaluation report."""
        print("\n" + "="*60)
        print("LODA LLM EVALUATION REPORT")
        print("="*60)
        
        print(f"Total Examples: {metrics['total_examples']}")
        print(f"Valid Programs: {metrics['valid_programs']} ({metrics['valid_program_rate']:.1%})")
        print(f"Exact Matches: {metrics['exact_matches']} ({metrics['exact_match_rate']:.1%})")
        print(f"Sequence Matches: {metrics['sequence_matches']} ({metrics['sequence_match_rate']:.1%})")
        print(f"Avg Generation Time: {metrics['avg_generation_time']:.2f}s")
        
        # Show some example results
        print("\n" + "-"*60)
        print("SAMPLE RESULTS")
        print("-"*60)
        
        # Show successful examples
        successful = [r for r in results if r.is_valid]
        if successful:
            print("\nSuccessful generations:")
            for i, result in enumerate(successful[:3]):  # Show first 3
                print(f"\n{i+1}. Description: {result.description}")
                print(f"   Generated: {result.generated_code.replace(chr(10), '; ')}")
                if result.generated_sequence:
                    print(f"   Sequence: {result.generated_sequence}")
        
        # Show failed examples
        failed = [r for r in results if not r.is_valid]
        if failed:
            print(f"\nFailed generations ({len(failed)} total):")
            for i, result in enumerate(failed[:3]):  # Show first 3
                print(f"\n{i+1}. Description: {result.description}")
                print(f"   Error: {result.error_message}")
                print(f"   Generated: {result.generated_code.replace(chr(10), '; ')}")

Evaluator for assessing model performance.

Initialize the evaluator.

Args

model
Trained LodaT5Model to evaluate

Methods

def evaluate_examples(self,
test_examples: List[TrainingExample]) ‑> Tuple[Dict[str, float], List[GenerationResult]]
def evaluate_examples(self, test_examples: List[TrainingExample]) -> Tuple[Dict[str, float], List[GenerationResult]]:
    """
    Evaluate the model on test examples.
    
    Args:
        test_examples: List of test examples
        
    Returns:
        Tuple of (metrics dictionary, list of GenerationResult objects)
    """
    print(f"Evaluating on {len(test_examples)} examples...")
    
    total_examples = len(test_examples)
    valid_programs = 0
    exact_matches = 0
    sequence_matches = 0
    total_generation_time = 0
    
    results = []
    
    for i, example in enumerate(test_examples):
        if i % 10 == 0:
            print(f"Progress: {i}/{total_examples}")
        
        # Generate code
        generation_results = self.generator.generate(example.description, num_samples=1)
        
        if generation_results:
            result = generation_results[0]
            results.append(result)
            
            total_generation_time += result.generation_time
            
            if result.is_valid:
                valid_programs += 1
                
                # Check for exact match
                if self._normalize_code(result.generated_code) == self._normalize_code(example.loda_code):
                    exact_matches += 1
                
                # Check for sequence match (if we have expected terms)
                if (example.terms and result.generated_sequence and 
                    len(result.generated_sequence) >= 3 and
                    result.generated_sequence[:3] == example.terms[:3]):
                    sequence_matches += 1
    
    # Calculate metrics
    metrics = {
        'total_examples': total_examples,
        'valid_program_rate': valid_programs / total_examples if total_examples > 0 else 0,
        'exact_match_rate': exact_matches / total_examples if total_examples > 0 else 0,
        'sequence_match_rate': sequence_matches / total_examples if total_examples > 0 else 0,
        'avg_generation_time': total_generation_time / total_examples if total_examples > 0 else 0,
        'valid_programs': valid_programs,
        'exact_matches': exact_matches,
        'sequence_matches': sequence_matches
    }
    
    return metrics, results

Evaluate the model on test examples.

Args

test_examples
List of test examples

Returns

Tuple of (metrics dictionary, list of GenerationResult objects)
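
A typical evaluation run, assuming model is a trained LodaT5Model and test_examples is a held-out list of TrainingExample objects:

>>> from loda.llm import LodaEvaluator
>>>
>>> evaluator = LodaEvaluator(model)
>>> metrics, results = evaluator.evaluate_examples(test_examples)
>>> evaluator.print_evaluation_report(metrics, results)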

def print_evaluation_report(self,
metrics: Dict[str, float],
results: List[GenerationResult])
def print_evaluation_report(self, metrics: Dict[str, float], results: List[GenerationResult]):
    """Print a detailed evaluation report."""
    print("\n" + "="*60)
    print("LODA LLM EVALUATION REPORT")
    print("="*60)
    
    print(f"Total Examples: {metrics['total_examples']}")
    print(f"Valid Programs: {metrics['valid_programs']} ({metrics['valid_program_rate']:.1%})")
    print(f"Exact Matches: {metrics['exact_matches']} ({metrics['exact_match_rate']:.1%})")
    print(f"Sequence Matches: {metrics['sequence_matches']} ({metrics['sequence_match_rate']:.1%})")
    print(f"Avg Generation Time: {metrics['avg_generation_time']:.2f}s")
    
    # Show some example results
    print("\n" + "-"*60)
    print("SAMPLE RESULTS")
    print("-"*60)
    
    # Show successful examples
    successful = [r for r in results if r.is_valid]
    if successful:
        print("\nSuccessful generations:")
        for i, result in enumerate(successful[:3]):  # Show first 3
            print(f"\n{i+1}. Description: {result.description}")
            print(f"   Generated: {result.generated_code.replace(chr(10), '; ')}")
            if result.generated_sequence:
                print(f"   Sequence: {result.generated_sequence}")
    
    # Show failed examples
    failed = [r for r in results if not r.is_valid]
    if failed:
        print(f"\nFailed generations ({len(failed)} total):")
        for i, result in enumerate(failed[:3]):  # Show first 3
            print(f"\n{i+1}. Description: {result.description}")
            print(f"   Error: {result.error_message}")
            print(f"   Generated: {result.generated_code.replace(chr(10), '; ')}")

Print a detailed evaluation report.

class LodaGenerator (model: LodaT5Model,
max_length: int = 256,
num_beams: int = 4)
class LodaGenerator:
    """Generator class for creating LODA code from natural language."""
    
    def __init__(self, model: LodaT5Model, max_length: int = 256, num_beams: int = 4):
        """
        Initialize the generator.
        
        Args:
            model: Trained LodaT5Model
            max_length: Maximum length of generated code
            num_beams: Number of beams for beam search
        """
        self.model = model
        self.max_length = max_length
        self.num_beams = num_beams
    
    def generate(self, description: str, num_samples: int = 1) -> List[GenerationResult]:
        """
        Generate LODA code from a natural language description.
        
        Args:
            description: Natural language description of the sequence
            num_samples: Number of code samples to generate
            
        Returns:
            List of GenerationResult objects
        """
        start_time = time.time()
        
        # Generate multiple samples
        descriptions = [description] * num_samples
        generated_codes = self.model.generate(
            descriptions, 
            max_length=self.max_length,
            num_beams=self.num_beams
        )
        
        generation_time = time.time() - start_time
        
        results = []
        for code in generated_codes:
            result = self._validate_and_evaluate_code(description, code)
            result.generation_time = generation_time / num_samples
            results.append(result)
        
        return results
    
    def _validate_and_evaluate_code(self, description: str, code: str) -> GenerationResult:
        """
        Validate and evaluate generated LODA code.
        
        Args:
            description: Original description
            code: Generated LODA code
            
        Returns:
            GenerationResult with validation info
        """
        result = GenerationResult(
            description=description,
            generated_code=code,
            is_valid=False
        )
        
        try:
            # Try to parse the program
            program = Program(code)
            
            # Try to evaluate it for a few terms
            interpreter = Interpreter(max_memory=100, max_stack=10, max_steps=10000)
            evaluator = Evaluator(program, interpreter)
            
            sequence_terms = []
            for i in range(10):  # Generate first 10 terms
                try:
                    term = evaluator(i)
                    sequence_terms.append(term)
                except Exception:
                    break  # Stop if evaluation fails
            
            if len(sequence_terms) >= 3:  # At least 3 terms generated
                result.is_valid = True
                result.generated_sequence = sequence_terms
            else:
                result.error_message = "Could not generate sufficient sequence terms"
        
        except Exception as e:
            result.error_message = f"Program validation failed: {str(e)}"
        
        return result
    
    def generate_interactive(self):
        """Interactive mode for generating LODA code."""
        print("LODA Code Generator - Interactive Mode")
        print("Enter natural language descriptions to generate LODA code.")
        print("Type 'quit' to exit.\n")
        
        while True:
            try:
                description = input("Description: ").strip()
                
                if description.lower() in ['quit', 'exit', 'q']:
                    print("Goodbye!")
                    break
                
                if not description:
                    continue
                
                print("Generating code...")
                results = self.generate(description, num_samples=1)
                
                for i, result in enumerate(results):
                    print(f"\n--- Result {i+1} ---")
                    print(f"Generated in {result.generation_time:.2f}s")
                    print(f"Valid: {result.is_valid}")
                    
                    if result.error_message:
                        print(f"Error: {result.error_message}")
                    
                    print("Generated LODA code:")
                    print(result.generated_code)
                    
                    if result.generated_sequence:
                        print(f"Sequence terms: {result.generated_sequence}")
                    
                    print("-" * 50)
            
            except KeyboardInterrupt:
                print("\nGoodbye!")
                break
            except Exception as e:
                print(f"Error: {e}")

Generator class for creating LODA code from natural language.

Initialize the generator.

Args

model
Trained LodaT5Model
max_length
Maximum length of generated code
num_beams
Number of beams for beam search

Methods

def generate(self, description: str, num_samples: int = 1) ‑> List[GenerationResult]
def generate(self, description: str, num_samples: int = 1) -> List[GenerationResult]:
    """
    Generate LODA code from a natural language description.
    
    Args:
        description: Natural language description of the sequence
        num_samples: Number of code samples to generate
        
    Returns:
        List of GenerationResult objects
    """
    start_time = time.time()
    
    # Generate multiple samples
    descriptions = [description] * num_samples
    generated_codes = self.model.generate(
        descriptions, 
        max_length=self.max_length,
        num_beams=self.num_beams
    )
    
    generation_time = time.time() - start_time
    
    results = []
    for code in generated_codes:
        result = self._validate_and_evaluate_code(description, code)
        result.generation_time = generation_time / num_samples
        results.append(result)
    
    return results

Generate LODA code from a natural language description.

Args

description
Natural language description of the sequence
num_samples
Number of code samples to generate

Returns

List of GenerationResult objects
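
Each GenerationResult carries both the generated code and the outcome of validating it; a sketch, assuming generator wraps a trained model and using an illustrative prompt:

>>> results = generator.generate("Triangular numbers", num_samples=1)
>>> result = results[0]
>>> if result.is_valid:
...     print(result.generated_sequence)
... else:
...     print(result.error_message)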

def generate_interactive(self)
def generate_interactive(self):
    """Interactive mode for generating LODA code."""
    print("LODA Code Generator - Interactive Mode")
    print("Enter natural language descriptions to generate LODA code.")
    print("Type 'quit' to exit.\n")
    
    while True:
        try:
            description = input("Description: ").strip()
            
            if description.lower() in ['quit', 'exit', 'q']:
                print("Goodbye!")
                break
            
            if not description:
                continue
            
            print("Generating code...")
            results = self.generate(description, num_samples=1)
            
            for i, result in enumerate(results):
                print(f"\n--- Result {i+1} ---")
                print(f"Generated in {result.generation_time:.2f}s")
                print(f"Valid: {result.is_valid}")
                
                if result.error_message:
                    print(f"Error: {result.error_message}")
                
                print("Generated LODA code:")
                print(result.generated_code)
                
                if result.generated_sequence:
                    print(f"Sequence terms: {result.generated_sequence}")
                
                print("-" * 50)
        
        except KeyboardInterrupt:
            print("\nGoodbye!")
            break
        except Exception as e:
            print(f"Error: {e}")

Interactive mode for generating LODA code.

class LodaT5Model (model_name: str = 't5-small', loda_vocab_size: int | None = None)
class LodaT5Model(nn.Module):
    """
    T5-based model for natural language to LODA code generation.
    """
    
    def __init__(self, model_name: str = "t5-small", loda_vocab_size: Optional[int] = None):
        """
        Initialize the model.
        
        Args:
            model_name: Base T5 model to use
            loda_vocab_size: Size of LODA vocabulary (if extending tokenizer)
        """
        super().__init__()
        
        # Load base T5 model and tokenizer
        self.text_tokenizer = T5Tokenizer.from_pretrained(model_name)
        self.model = T5ForConditionalGeneration.from_pretrained(model_name)
        
        # Initialize LODA tokenizer
        self.loda_tokenizer = LodaTokenizer()
        
        # If we need to extend the vocabulary
        if loda_vocab_size and loda_vocab_size > self.loda_tokenizer.vocab_size:
            # Could extend vocabulary here if needed
            pass
    
    def prepare_input(self, descriptions: List[str]) -> Dict[str, torch.Tensor]:
        """
        Prepare natural language descriptions for input.
        
        Args:
            descriptions: List of natural language descriptions
            
        Returns:
            Dictionary with input tensors
        """
        # Add task prefix for T5
        prefixed_descriptions = [f"translate to loda: {desc}" for desc in descriptions]
        
        # Tokenize with T5 tokenizer
        encoded = self.text_tokenizer(
            prefixed_descriptions,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )
        
        return encoded
    
    def prepare_target(self, loda_codes: List[str]) -> Dict[str, torch.Tensor]:
        """
        Prepare LODA codes as targets.
        
        Args:
            loda_codes: List of LODA assembly codes
            
        Returns:
            Dictionary with target tensors
        """
        # For T5, we need to encode targets using the text tokenizer as well
        # We'll create a custom format that represents LODA code
        
        # Convert LODA to a text representation that T5 can understand
        text_loda_codes = []
        for code in loda_codes:
            # Convert LODA code to a more text-like format
            text_code = self.loda_to_text_format(code)
            text_loda_codes.append(text_code)
        
        encoded = self.text_tokenizer(
            text_loda_codes,
            padding=True,
            truncation=True,
            max_length=256,
            return_tensors="pt"
        )
        
        return encoded
    
    def loda_to_text_format(self, code: str) -> str:
        """
        Convert LODA code to a text format suitable for T5.
        
        This creates a more natural language representation of LODA code.
        
        Args:
            code: LODA assembly code
            
        Returns:
            Text representation of the code
        """
        lines = code.strip().split('\n')
        text_parts = []
        
        for line in lines:
            line = line.strip()
            if not line:
                continue
            
            # Parse the line and convert to text
            parts = line.replace(',', ' ').split()
            if len(parts) >= 3:
                op, target, source = parts[0], parts[1], parts[2]
                text_parts.append(f"{op} {target} {source}")
            elif len(parts) >= 2:
                op, target = parts[0], parts[1]
                text_parts.append(f"{op} {target}")
            else:
                text_parts.append(line)
        
        return " | ".join(text_parts)
    
    def text_format_to_loda(self, text_code: str) -> str:
        """
        Convert text format back to LODA code.
        
        Args:
            text_code: Text representation of LODA code
            
        Returns:
            LODA assembly code
        """
        parts = text_code.split(" | ")
        loda_lines = []
        
        for part in parts:
            part = part.strip()
            if not part:
                continue
            
            tokens = part.split()
            if len(tokens) >= 3:
                op, target, source = tokens[0], tokens[1], tokens[2]
                loda_lines.append(f"{op} {target},{source}")
            elif len(tokens) >= 2:
                op, target = tokens[0], tokens[1]
                loda_lines.append(f"{op} {target}")
            else:
                loda_lines.append(part)
        
        return '\n'.join(loda_lines)
    
    def forward(self, input_ids, attention_mask, labels=None):
        """
        Forward pass of the model.
        
        Args:
            input_ids: Input token IDs
            attention_mask: Attention mask
            labels: Target labels (for training)
            
        Returns:
            Model outputs
        """
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
    
    def generate(self, descriptions: List[str], max_length: int = 256, num_beams: int = 4) -> List[str]:
        """
        Generate LODA code from natural language descriptions.
        
        Args:
            descriptions: List of natural language descriptions
            max_length: Maximum length of generated sequences
            num_beams: Number of beams for beam search
            
        Returns:
            List of generated LODA codes
        """
        # Prepare input
        inputs = self.prepare_input(descriptions)
        
        # Generate with the model
        with torch.no_grad():
            generated_ids = self.model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_length=max_length,
                num_beams=num_beams,
                early_stopping=True,
                do_sample=False
            )
        
        # Decode generated sequences
        generated_texts = self.text_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        
        # Convert from text format back to LODA
        loda_codes = [self.text_format_to_loda(text) for text in generated_texts]
        
        return loda_codes
    
    def save_model(self, save_path: str):
        """
        Save the model and tokenizers.
        
        Args:
            save_path: Directory to save the model
        """
        os.makedirs(save_path, exist_ok=True)
        
        # Save T5 model and tokenizer
        self.model.save_pretrained(save_path)
        self.text_tokenizer.save_pretrained(save_path)
        
        # Save LODA tokenizer
        loda_tokenizer_path = os.path.join(save_path, "loda_tokenizer.json")
        with open(loda_tokenizer_path, 'w') as f:
            json.dump({
                'vocab': self.loda_tokenizer.vocab,
                'reverse_vocab': {str(k): v for k, v in self.loda_tokenizer.reverse_vocab.items()}
            }, f, indent=2)
    
    @classmethod
    def load_model(cls, load_path: str):
        """
        Load a saved model.
        
        Args:
            load_path: Directory containing the saved model
            
        Returns:
            Loaded LodaT5Model instance
        """
        # Load T5 model and tokenizer
        model = T5ForConditionalGeneration.from_pretrained(load_path)
        text_tokenizer = T5Tokenizer.from_pretrained(load_path)
        
        # Create model instance
        loda_model = cls()
        loda_model.model = model
        loda_model.text_tokenizer = text_tokenizer
        
        # Load LODA tokenizer if it exists
        loda_tokenizer_path = os.path.join(load_path, "loda_tokenizer.json")
        if os.path.exists(loda_tokenizer_path):
            with open(loda_tokenizer_path, 'r') as f:
                tokenizer_data = json.load(f)
            
            loda_model.loda_tokenizer.vocab = tokenizer_data['vocab']
            loda_model.loda_tokenizer.reverse_vocab = {
                int(k): v for k, v in tokenizer_data['reverse_vocab'].items()
            }
        
        return loda_model

T5-based model for natural language to LODA code generation.

Initialize the model.

Args

model_name
Base T5 model to use
loda_vocab_size
Size of LODA vocabulary (if extending tokenizer)

Ancestors

  • torch.nn.modules.module.Module

Static methods

def load_model(load_path: str)

Load a saved model.

Args

load_path
Directory containing the saved model

Returns

Loaded LodaT5Model instance
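
A reload-and-generate sketch, assuming a model was previously saved to the (illustrative) directory loda_llm_model:

>>> from loda.llm import LodaT5Model
>>>
>>> model = LodaT5Model.load_model("loda_llm_model")
>>> codes = model.generate(["Fibonacci numbers"])
>>> print(codes[0])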

Methods

def forward(self, input_ids, attention_mask, labels=None) ‑> Callable[..., Any]
def forward(self, input_ids, attention_mask, labels=None):
    """
    Forward pass of the model.
    
    Args:
        input_ids: Input token IDs
        attention_mask: Attention mask
        labels: Target labels (for training)
        
    Returns:
        Model outputs
    """
    return self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        labels=labels
    )

Forward pass of the model.

Args

input_ids
Input token IDs
attention_mask
Attention mask
labels
Target labels (for training)

Returns

Model outputs

def generate(self, descriptions: List[str], max_length: int = 256, num_beams: int = 4) ‑> List[str]
def generate(self, descriptions: List[str], max_length: int = 256, num_beams: int = 4) -> List[str]:
    """
    Generate LODA code from natural language descriptions.
    
    Args:
        descriptions: List of natural language descriptions
        max_length: Maximum length of generated sequences
        num_beams: Number of beams for beam search
        
    Returns:
        List of generated LODA codes
    """
    # Prepare input
    inputs = self.prepare_input(descriptions)
    
    # Generate with the model
    with torch.no_grad():
        generated_ids = self.model.generate(
            input_ids=inputs['input_ids'],
            attention_mask=inputs['attention_mask'],
            max_length=max_length,
            num_beams=num_beams,
            early_stopping=True,
            do_sample=False
        )
    
    # Decode generated sequences
    generated_texts = self.text_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    
    # Convert from text format back to LODA
    loda_codes = [self.text_format_to_loda(text) for text in generated_texts]
    
    return loda_codes

Generate LODA code from natural language descriptions.

Args

descriptions
List of natural language descriptions
max_length
Maximum length of generated sequences
num_beams
Number of beams for beam search

Returns

List of generated LODA codes
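
Descriptions are processed as a batch, with one generated program returned per input description:

>>> codes = model.generate(["Fibonacci numbers", "Powers of 2"], num_beams=4)
>>> len(codes)
2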

def loda_to_text_format(self, code: str) ‑> str
def loda_to_text_format(self, code: str) -> str:
    """
    Convert LODA code to a text format suitable for T5.
    
    This creates a more natural language representation of LODA code.
    
    Args:
        code: LODA assembly code
        
    Returns:
        Text representation of the code
    """
    lines = code.strip().split('\n')
    text_parts = []
    
    for line in lines:
        line = line.strip()
        if not line:
            continue
        
        # Parse the line and convert to text
        parts = line.replace(',', ' ').split()
        if len(parts) >= 3:
            op, target, source = parts[0], parts[1], parts[2]
            text_parts.append(f"{op} {target} {source}")
        elif len(parts) >= 2:
            op, target = parts[0], parts[1]
            text_parts.append(f"{op} {target}")
        else:
            text_parts.append(line)
    
    return " | ".join(text_parts)

Convert LODA code to a text format suitable for T5.

This creates a more natural language representation of LODA code.

Args

code
LODA assembly code

Returns

Text representation of the code
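
For example, operands are separated by spaces and instructions are joined with " | ":

>>> model.loda_to_text_format("mov $1,$0\nadd $1,1\nlpe")
'mov $1 $0 | add $1 1 | lpe'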

def prepare_input(self, descriptions: List[str]) ‑> Dict[str, torch.Tensor]
def prepare_input(self, descriptions: List[str]) -> Dict[str, torch.Tensor]:
    """
    Prepare natural language descriptions for input.
    
    Args:
        descriptions: List of natural language descriptions
        
    Returns:
        Dictionary with input tensors
    """
    # Add task prefix for T5
    prefixed_descriptions = [f"translate to loda: {desc}" for desc in descriptions]
    
    # Tokenize with T5 tokenizer
    encoded = self.text_tokenizer(
        prefixed_descriptions,
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt"
    )
    
    return encoded

Prepare natural language descriptions for input.

Args

descriptions
List of natural language descriptions

Returns

Dictionary with input tensors

def prepare_target(self, loda_codes: List[str]) ‑> Dict[str, torch.Tensor]
def prepare_target(self, loda_codes: List[str]) -> Dict[str, torch.Tensor]:
    """
    Prepare LODA codes as targets.
    
    Args:
        loda_codes: List of LODA assembly codes
        
    Returns:
        Dictionary with target tensors
    """
    # For T5, we need to encode targets using the text tokenizer as well
    # We'll create a custom format that represents LODA code
    
    # Convert LODA to a text representation that T5 can understand
    text_loda_codes = []
    for code in loda_codes:
        # Convert LODA code to a more text-like format
        text_code = self.loda_to_text_format(code)
        text_loda_codes.append(text_code)
    
    encoded = self.text_tokenizer(
        text_loda_codes,
        padding=True,
        truncation=True,
        max_length=256,
        return_tensors="pt"
    )
    
    return encoded

Prepare LODA codes as targets.

Args

loda_codes
List of LODA assembly codes

Returns

Dictionary with target tensors

def save_model(self, save_path: str)
def save_model(self, save_path: str):
    """
    Save the model and tokenizers.
    
    Args:
        save_path: Directory to save the model
    """
    os.makedirs(save_path, exist_ok=True)
    
    # Save T5 model and tokenizer
    self.model.save_pretrained(save_path)
    self.text_tokenizer.save_pretrained(save_path)
    
    # Save LODA tokenizer
    loda_tokenizer_path = os.path.join(save_path, "loda_tokenizer.json")
    with open(loda_tokenizer_path, 'w') as f:
        json.dump({
            'vocab': self.loda_tokenizer.vocab,
            'reverse_vocab': {str(k): v for k, v in self.loda_tokenizer.reverse_vocab.items()}
        }, f, indent=2)

Save the model and tokenizers.

Args

save_path
Directory to save the model
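
A minimal saving sketch (assuming model is a LodaT5Model instance; the directory name is illustrative): the directory receives the T5 weights and tokenizer files via save_pretrained, plus a loda_tokenizer.json holding the custom LODA vocabulary:

>>> model.save_model("trained_model")
>>> import os
>>> os.path.exists("trained_model/loda_tokenizer.json")
True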

def text_format_to_loda(self, text_code: str) ‑> str
Expand source code
def text_format_to_loda(self, text_code: str) -> str:
    """
    Convert text format back to LODA code.
    
    Args:
        text_code: Text representation of LODA code
        
    Returns:
        LODA assembly code
    """
    parts = text_code.split(" | ")
    loda_lines = []
    
    for part in parts:
        part = part.strip()
        if not part:
            continue
        
        tokens = part.split()
        if len(tokens) >= 3:
            op, target, source = tokens[0], tokens[1], tokens[2]
            loda_lines.append(f"{op} {target},{source}")
        elif len(tokens) >= 2:
            op, target = tokens[0], tokens[1]
            loda_lines.append(f"{op} {target}")
        else:
            loda_lines.append(part)
    
    return '\n'.join(loda_lines)

Convert text format back to LODA code.

Args

text_code
Text representation of LODA code

Returns

LODA assembly code
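
This inverts loda_to_text_format, so a round trip preserves the program (example derived from the implementation above; model is assumed to be a LodaT5Model instance):

>>> model.text_format_to_loda("mov $0 1 | lpb $0 | sub $0 1 | lpe")
'mov $0,1\nlpb $0\nsub $0,1\nlpe'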

class LodaTokenizer
Expand source code
class LodaTokenizer:
    """Custom tokenizer for LODA assembly language."""
    
    def __init__(self):
        """Initialize LODA tokenizer with vocabulary."""
        # LODA operations
        self.operations = [
            'mov', 'add', 'sub', 'mul', 'div', 'dif', 'mod', 'pow', 'gcd', 'bin',
            'cmp', 'min', 'max', 'lpb', 'lpe', 'nop', 'cal', 'seq', 'trn', 'clr'
        ]
        
        # Common operand patterns
        self.operand_patterns = [
            # Direct memory references
            '$0', '$1', '$2', '$3', '$4', '$5', '$6', '$7', '$8', '$9', '$10',
            # Indirect memory references  
            '$$1', '$$2', '$$3', '$$4', '$$5',
            # Common constants
            '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '-1'
        ]
        
        # Special tokens
        self.special_tokens = ['<pad>', '<unk>', '<s>', '</s>', '<mask>']
        
        # Build vocabulary
        self.vocab = {}
        self.reverse_vocab = {}
        
        # Add special tokens first
        for i, token in enumerate(self.special_tokens):
            self.vocab[token] = i
            self.reverse_vocab[i] = token
        
        # Add operations
        for token in self.operations:
            idx = len(self.vocab)
            self.vocab[token] = idx
            self.reverse_vocab[idx] = token
        
        # Add operand patterns
        for token in self.operand_patterns:
            idx = len(self.vocab)
            self.vocab[token] = idx
            self.reverse_vocab[idx] = token
        
        self.vocab_size = len(self.vocab)
        self.pad_token_id = self.vocab['<pad>']
        self.unk_token_id = self.vocab['<unk>']
        self.bos_token_id = self.vocab['<s>']
        self.eos_token_id = self.vocab['</s>']
    
    def tokenize_loda_code(self, code: str) -> List[str]:
        """
        Tokenize LODA assembly code.
        
        Args:
            code: LODA assembly code as string
            
        Returns:
            List of tokens
        """
        lines = code.strip().split('\n')
        tokens = ['<s>']  # Start token
        
        for line in lines:
            line = line.strip()
            if not line:
                continue
            
            # Split on whitespace and comma
            parts = line.replace(',', ' ').split()
            
            for part in parts:
                part = part.strip()
                if part in self.vocab:
                    tokens.append(part)
                else:
                    # Try to handle unknown operands
                    if part.startswith('$') and part[1:].isdigit():
                        # Direct memory reference
                        if part in self.vocab:
                            tokens.append(part)
                        else:
                            tokens.append('<unk>')
                    elif part.startswith('$$') and part[2:].isdigit():
                        # Indirect memory reference
                        if part in self.vocab:
                            tokens.append(part)
                        else:
                            tokens.append('<unk>')
                    elif part.lstrip('-').isdigit():
                        # Numeric constant
                        if part in self.vocab:
                            tokens.append(part)
                        else:
                            tokens.append('<unk>')
                    else:
                        tokens.append('<unk>')
        
        tokens.append('</s>')  # End token
        return tokens
    
    def encode_loda_code(self, code: str) -> List[int]:
        """
        Encode LODA code to token IDs.
        
        Args:
            code: LODA assembly code
            
        Returns:
            List of token IDs
        """
        tokens = self.tokenize_loda_code(code)
        return [self.vocab.get(token, self.unk_token_id) for token in tokens]
    
    def decode_loda_code(self, token_ids: List[int]) -> str:
        """
        Decode token IDs back to LODA code.
        
        Args:
            token_ids: List of token IDs
            
        Returns:
            LODA assembly code as string
        """
        tokens = [self.reverse_vocab.get(id, '<unk>') for id in token_ids]
        
        # Filter out special tokens
        filtered_tokens = []
        for token in tokens:
            if token in ['<s>', '</s>', '<pad>']:
                continue
            if token == '<unk>':
                continue
            filtered_tokens.append(token)
        
        # Reconstruct LODA code
        code_lines = []
        i = 0
        
        while i < len(filtered_tokens):
            if i + 2 < len(filtered_tokens):
                # Try to form operation: op target source
                op = filtered_tokens[i]
                if op in self.operations and i + 2 < len(filtered_tokens):
                    target = filtered_tokens[i + 1]
                    source = filtered_tokens[i + 2]
                    code_lines.append(f"{op} {target},{source}")
                    i += 3
                elif op in self.operations and i + 1 < len(filtered_tokens):
                    # Single operand operation
                    target = filtered_tokens[i + 1]
                    code_lines.append(f"{op} {target}")
                    i += 2
                else:
                    i += 1
            else:
                i += 1
        
        return '\n'.join(code_lines)

Custom tokenizer for LODA assembly language.

Initialize LODA tokenizer with vocabulary.
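
The vocabulary is fixed and built from the lists in the source above (5 special tokens, 20 operations, 28 operand patterns), so its basic properties follow directly:

>>> tok = LodaTokenizer()
>>> tok.vocab_size
53
>>> (tok.pad_token_id, tok.bos_token_id, tok.eos_token_id)
(0, 2, 3)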

Methods

def decode_loda_code(self, token_ids: List[int]) ‑> str
Expand source code
def decode_loda_code(self, token_ids: List[int]) -> str:
    """
    Decode token IDs back to LODA code.
    
    Args:
        token_ids: List of token IDs
        
    Returns:
        LODA assembly code as string
    """
    tokens = [self.reverse_vocab.get(id, '<unk>') for id in token_ids]
    
    # Filter out special tokens
    filtered_tokens = []
    for token in tokens:
        if token in ['<s>', '</s>', '<pad>']:
            continue
        if token == '<unk>':
            continue
        filtered_tokens.append(token)
    
    # Reconstruct LODA code
    code_lines = []
    i = 0
    
    while i < len(filtered_tokens):
        if i + 2 < len(filtered_tokens):
            # Try to form operation: op target source
            op = filtered_tokens[i]
            if op in self.operations and i + 2 < len(filtered_tokens):
                target = filtered_tokens[i + 1]
                source = filtered_tokens[i + 2]
                code_lines.append(f"{op} {target},{source}")
                i += 3
            elif op in self.operations and i + 1 < len(filtered_tokens):
                # Single operand operation
                target = filtered_tokens[i + 1]
                code_lines.append(f"{op} {target}")
                i += 2
            else:
                i += 1
        else:
            i += 1
    
    return '\n'.join(code_lines)

Decode token IDs back to LODA code.

Args

token_ids
List of token IDs

Returns

LODA assembly code as string
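
For example, decoding the IDs produced for "mov $0,1" (see encode_loda_code below; the exact IDs follow from the vocabulary construction order shown in the class source) reconstructs the instruction with the comma between target and source:

>>> tok = LodaTokenizer()
>>> tok.decode_loda_code([2, 5, 25, 42, 3])
'mov $0,1'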

def encode_loda_code(self, code: str) ‑> List[int]
Expand source code
def encode_loda_code(self, code: str) -> List[int]:
    """
    Encode LODA code to token IDs.
    
    Args:
        code: LODA assembly code
        
    Returns:
        List of token IDs
    """
    tokens = self.tokenize_loda_code(code)
    return [self.vocab.get(token, self.unk_token_id) for token in tokens]

Encode LODA code to token IDs.

Args

code
LODA assembly code

Returns

List of token IDs
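
With the vocabulary built in __init__ (special tokens at IDs 0-4, operations starting at 5, operands starting at 25), a single instruction encodes as:

>>> tok = LodaTokenizer()
>>> tok.encode_loda_code("mov $0,1")
[2, 5, 25, 42, 3]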

def tokenize_loda_code(self, code: str) ‑> List[str]
Expand source code
def tokenize_loda_code(self, code: str) -> List[str]:
    """
    Tokenize LODA assembly code.
    
    Args:
        code: LODA assembly code as string
        
    Returns:
        List of tokens
    """
    lines = code.strip().split('\n')
    tokens = ['<s>']  # Start token
    
    for line in lines:
        line = line.strip()
        if not line:
            continue
        
        # Split on whitespace and comma
        parts = line.replace(',', ' ').split()
        
        for part in parts:
            part = part.strip()
            if part in self.vocab:
                tokens.append(part)
            else:
                # Try to handle unknown operands
                if part.startswith('$') and part[1:].isdigit():
                    # Direct memory reference
                    if part in self.vocab:
                        tokens.append(part)
                    else:
                        tokens.append('<unk>')
                elif part.startswith('$$') and part[2:].isdigit():
                    # Indirect memory reference
                    if part in self.vocab:
                        tokens.append(part)
                    else:
                        tokens.append('<unk>')
                elif part.lstrip('-').isdigit():
                    # Numeric constant
                    if part in self.vocab:
                        tokens.append(part)
                    else:
                        tokens.append('<unk>')
                else:
                    tokens.append('<unk>')
    
    tokens.append('</s>')  # End token
    return tokens

Tokenize LODA assembly code.

Args

code
LODA assembly code as string

Returns

List of tokens
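
For example, tokens outside the fixed vocabulary (such as memory cells above $10) become '<unk>':

>>> tok = LodaTokenizer()
>>> tok.tokenize_loda_code("mov $0,1\nadd $0,$2")
['<s>', 'mov', '$0', '1', 'add', '$0', '$2', '</s>']
>>> tok.tokenize_loda_code("mov $42,7")
['<s>', 'mov', '<unk>', '7', '</s>']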

class LodaTrainer (model: LodaT5Model,
train_dataset: LodaDataset,
val_dataset: LodaDataset | None = None,
learning_rate: float = 5e-05,
batch_size: int = 8,
num_epochs: int = 3,
warmup_steps: int = 500,
save_dir: str = 'loda_llm_model')
Expand source code
class LodaTrainer:
    """Trainer class for LODA LLM."""
    
    def __init__(self, 
                 model: LodaT5Model,
                 train_dataset: LodaDataset,
                 val_dataset: Optional[LodaDataset] = None,
                 learning_rate: float = 5e-5,
                 batch_size: int = 8,
                 num_epochs: int = 3,
                 warmup_steps: int = 500,
                 save_dir: str = "loda_llm_model"):
        """
        Initialize the trainer.
        
        Args:
            model: LodaT5Model to train
            train_dataset: Training dataset
            val_dataset: Validation dataset (optional)
            learning_rate: Learning rate
            batch_size: Batch size
            num_epochs: Number of training epochs
            warmup_steps: Number of warmup steps for learning rate schedule
            save_dir: Directory to save the model
        """
        self.model = model
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.warmup_steps = warmup_steps
        self.save_dir = save_dir
        
        # Set up device
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.model.to(self.device)
        
        # Set up data loaders
        self.train_loader = DataLoader(
            train_dataset, 
            batch_size=batch_size, 
            shuffle=True,
            collate_fn=self._collate_fn
        )
        
        if val_dataset:
            self.val_loader = DataLoader(
                val_dataset, 
                batch_size=batch_size, 
                shuffle=False,
                collate_fn=self._collate_fn
            )
        
        # Set up optimizer
        self.optimizer = AdamW(
            self.model.model.parameters(),
            lr=learning_rate,
            weight_decay=0.01
        )
        
        # Set up learning rate scheduler
        total_steps = len(self.train_loader) * num_epochs
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=total_steps
        )
    
    def _collate_fn(self, batch):
        """Collate function for DataLoader."""
        # Pad sequences to the same length
        input_ids = [item['input_ids'] for item in batch]
        attention_masks = [item['attention_mask'] for item in batch]
        labels = [item['labels'] for item in batch]
        decoder_attention_masks = [item['decoder_attention_mask'] for item in batch]
        
        # Pad input sequences
        max_input_len = max(len(seq) for seq in input_ids)
        padded_input_ids = []
        padded_attention_masks = []
        
        for i in range(len(input_ids)):
            seq_len = len(input_ids[i])
            pad_len = max_input_len - seq_len
            
            padded_input_ids.append(
                torch.cat([input_ids[i], torch.zeros(pad_len, dtype=torch.long)])
            )
            padded_attention_masks.append(
                torch.cat([attention_masks[i], torch.zeros(pad_len, dtype=torch.long)])
            )
        
        # Pad target sequences
        max_target_len = max(len(seq) for seq in labels)
        padded_labels = []
        padded_decoder_masks = []
        
        for i in range(len(labels)):
            seq_len = len(labels[i])
            pad_len = max_target_len - seq_len
            
            # For labels, use -100 for padding (ignored in loss calculation)
            padded_labels.append(
                torch.cat([labels[i], torch.full((pad_len,), -100, dtype=torch.long)])
            )
            padded_decoder_masks.append(
                torch.cat([decoder_attention_masks[i], torch.zeros(pad_len, dtype=torch.long)])
            )
        
        return {
            'input_ids': torch.stack(padded_input_ids),
            'attention_mask': torch.stack(padded_attention_masks),
            'labels': torch.stack(padded_labels),
            'decoder_attention_mask': torch.stack(padded_decoder_masks)
        }
    
    def train_epoch(self):
        """Train for one epoch."""
        self.model.model.train()
        total_loss = 0
        
        progress_bar = tqdm(self.train_loader, desc="Training")
        
        for batch in progress_bar:
            # Move to device
            batch = {k: v.to(self.device) for k, v in batch.items()}
            
            # Forward pass
            outputs = self.model.forward(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                labels=batch['labels']
            )
            
            loss = outputs.loss
            total_loss += loss.item()
            
            # Backward pass
            loss.backward()
            
            # Clip gradients
            torch.nn.utils.clip_grad_norm_(self.model.model.parameters(), 1.0)
            
            # Update parameters
            self.optimizer.step()
            self.scheduler.step()
            self.optimizer.zero_grad()
            
            # Update progress bar
            progress_bar.set_postfix({'loss': loss.item()})
        
        return total_loss / len(self.train_loader)
    
    def validate(self):
        """Validate the model."""
        if not self.val_dataset:
            return None
        
        self.model.model.eval()
        total_loss = 0
        
        with torch.no_grad():
            progress_bar = tqdm(self.val_loader, desc="Validation")
            
            for batch in progress_bar:
                # Move to device
                batch = {k: v.to(self.device) for k, v in batch.items()}
                
                # Forward pass
                outputs = self.model.forward(
                    input_ids=batch['input_ids'],
                    attention_mask=batch['attention_mask'],
                    labels=batch['labels']
                )
                
                loss = outputs.loss
                total_loss += loss.item()
                
                progress_bar.set_postfix({'val_loss': loss.item()})
        
        return total_loss / len(self.val_loader)
    
    def train(self):
        """Train the model."""
        print(f"Training on device: {self.device}")
        print(f"Training examples: {len(self.train_dataset)}")
        if self.val_dataset:
            print(f"Validation examples: {len(self.val_dataset)}")
        
        best_val_loss = float('inf')
        
        for epoch in range(self.num_epochs):
            print(f"\nEpoch {epoch + 1}/{self.num_epochs}")
            
            # Train
            train_loss = self.train_epoch()
            print(f"Training loss: {train_loss:.4f}")
            
            # Validate
            val_loss = self.validate()
            if val_loss is not None:
                print(f"Validation loss: {val_loss:.4f}")
                
                # Save best model
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    self.save_model(f"{self.save_dir}_best")
                    print("Saved best model")
            
            # Save checkpoint
            self.save_model(f"{self.save_dir}_epoch_{epoch + 1}")
        
        print("\nTraining completed!")
        return self.model
    
    def save_model(self, path: str):
        """Save the model."""
        self.model.save_model(path)

Trainer class for LODA LLM.

Initialize the trainer.

Args

model
LodaT5Model to train
train_dataset
Training dataset
val_dataset
Validation dataset (optional)
learning_rate
Learning rate
batch_size
Batch size
num_epochs
Number of training epochs
warmup_steps
Number of warmup steps for learning rate schedule
save_dir
Directory to save the model
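
A minimal training sketch (assuming model, train_dataset and val_dataset have already been built with LodaT5Model and LodaDataset, as in train_loda_llm; hyperparameters and paths are illustrative). Checkpoints are written as "<save_dir>_epoch_N" after every epoch and "<save_dir>_best" whenever the validation loss improves:

>>> trainer = LodaTrainer(model, train_dataset, val_dataset,
...                       learning_rate=5e-5, batch_size=8, num_epochs=3,
...                       save_dir="loda_llm_model")
>>> trained_model = trainer.train()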

Methods

def save_model(self, path: str)
Expand source code
def save_model(self, path: str):
    """Save the model."""
    self.model.save_model(path)

Save the model to the given path (delegates to LodaT5Model.save_model()).

def train(self)
Expand source code
def train(self):
    """Train the model."""
    print(f"Training on device: {self.device}")
    print(f"Training examples: {len(self.train_dataset)}")
    if self.val_dataset:
        print(f"Validation examples: {len(self.val_dataset)}")
    
    best_val_loss = float('inf')
    
    for epoch in range(self.num_epochs):
        print(f"\nEpoch {epoch + 1}/{self.num_epochs}")
        
        # Train
        train_loss = self.train_epoch()
        print(f"Training loss: {train_loss:.4f}")
        
        # Validate
        val_loss = self.validate()
        if val_loss is not None:
            print(f"Validation loss: {val_loss:.4f}")
            
            # Save best model
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                self.save_model(f"{self.save_dir}_best")
                print("Saved best model")
        
        # Save checkpoint
        self.save_model(f"{self.save_dir}_epoch_{epoch + 1}")
    
    print("\nTraining completed!")
    return self.model

Run the full training loop: train and validate each epoch, write a checkpoint after every epoch, save the best model whenever the validation loss improves, and return the trained model.

def train_epoch(self)
Expand source code
def train_epoch(self):
    """Train for one epoch."""
    self.model.model.train()
    total_loss = 0
    
    progress_bar = tqdm(self.train_loader, desc="Training")
    
    for batch in progress_bar:
        # Move to device
        batch = {k: v.to(self.device) for k, v in batch.items()}
        
        # Forward pass
        outputs = self.model.forward(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            labels=batch['labels']
        )
        
        loss = outputs.loss
        total_loss += loss.item()
        
        # Backward pass
        loss.backward()
        
        # Clip gradients
        torch.nn.utils.clip_grad_norm_(self.model.model.parameters(), 1.0)
        
        # Update parameters
        self.optimizer.step()
        self.scheduler.step()
        self.optimizer.zero_grad()
        
        # Update progress bar
        progress_bar.set_postfix({'loss': loss.item()})
    
    return total_loss / len(self.train_loader)

Train for one epoch and return the average training loss.

def validate(self)
Expand source code
def validate(self):
    """Validate the model."""
    if not self.val_dataset:
        return None
    
    self.model.model.eval()
    total_loss = 0
    
    with torch.no_grad():
        progress_bar = tqdm(self.val_loader, desc="Validation")
        
        for batch in progress_bar:
            # Move to device
            batch = {k: v.to(self.device) for k, v in batch.items()}
            
            # Forward pass
            outputs = self.model.forward(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                labels=batch['labels']
            )
            
            loss = outputs.loss
            total_loss += loss.item()
            
            progress_bar.set_postfix({'val_loss': loss.item()})
    
    return total_loss / len(self.val_loader)

Evaluate on the validation set and return the average validation loss, or None if no validation dataset was provided.

class TrainingExample (sequence_id: str,
description: str,
loda_code: str,
terms: List[int] | None = None)
Expand source code
@dataclass
class TrainingExample:
    """A single training example pairing natural language with LODA code."""
    sequence_id: str
    description: str
    loda_code: str
    terms: Optional[List[int]] = None

A single training example pairing natural language with LODA code.

Instance variables

var description : str
var loda_code : str
var sequence_id : str
var terms : List[int] | None
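
A minimal construction example (values are illustrative; A000012 is the all-ones OEIS sequence, and "mov $0,1" is a trivial program that always outputs 1):

>>> ex = TrainingExample(sequence_id="A000012",
...                      description="The all 1's sequence",
...                      loda_code="mov $0,1",
...                      terms=[1, 1, 1, 1, 1])
>>> ex.description
"The all 1's sequence"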