Module loda.llm
Large Language Model (LLM) implementation for natural language to LODA code generation.
This module provides functionality to train transformer-based models that can understand natural language descriptions of integer sequences (like OEIS sequences) and generate corresponding LODA assembly programs.
Key components:
- Data preprocessing for OEIS sequence descriptions and LODA programs
- Transformer-based encoder-decoder architecture
- Training pipeline with proper tokenization
- Inference utilities for code generation
- Evaluation metrics for generated programs
Example usage:
>>> from loda.llm import LodaT5Model, LodaGenerator, train_loda_llm
>>>
>>> # Train a model
>>> model = train_loda_llm("programs/oeis", "trained_model")
>>>
>>> # Generate code
>>> generator = LodaGenerator(model)
>>> results = generator.generate("Fibonacci numbers")
>>> print(results[0].generated_code)
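A further sketch for reloading a previously trained model for inference; the directory name is a placeholder and assumes a model was saved earlier by train_loda_llm or save_model:
>>> from loda.llm import LodaT5Model, LodaGenerator
>>> model = LodaT5Model.load_model("trained_model")
>>> generator = LodaGenerator(model, num_beams=4)
>>> result = generator.generate("Triangular numbers", num_samples=1)[0]
>>> print(result.is_valid, result.generated_code)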
Sub-modules
loda.llm.data_preprocessing
Data preprocessing utilities for LLM training on OEIS sequences and LODA programs …
loda.llm.inference
Inference and evaluation utilities for the LODA LLM …
loda.llm.model
Transformer-based model for natural language to LODA code generation …
loda.llm.trainer
Training script for the LODA LLM (Large Language Model) …
Functions
def create_dataset(programs_dir: str, output_file: str, max_examples: int = -1, augment: bool = True)
Source code:

def create_dataset(programs_dir: str, output_file: str, max_examples: int = -1, augment: bool = True):
    """
    Convenience function to create and save a training dataset.

    Args:
        programs_dir: Path to OEIS programs directory
        output_file: Path to save the dataset
        max_examples: Maximum number of examples (-1 for all)
        augment: Whether to augment with description variations
    """
    preprocessor = DataPreprocessor(programs_dir)
    examples = preprocessor.create_training_examples(max_examples)

    if augment:
        examples = preprocessor.augment_descriptions(examples)

    preprocessor.save_dataset(examples, output_file)
    return examples

Convenience function to create and save a training dataset.
Args
programs_dir: Path to OEIS programs directory
output_file: Path to save the dataset
max_examples: Maximum number of examples (-1 for all)
augment: Whether to augment with description variations
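A minimal usage sketch; the paths are placeholders for a local OEIS programs checkout and an output file:
>>> from loda.llm import create_dataset
>>> examples = create_dataset("programs/oeis", "loda_dataset.json", max_examples=1000)
>>> print(len(examples))
With augment=True (the default), the returned list also contains the generated description variations.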
def train_loda_llm(programs_dir: str,
output_dir: str = 'loda_llm_model',
model_name: str = 't5-small',
max_examples: int = -1,
val_split: float = 0.1,
batch_size: int = 8,
learning_rate: float = 5e-05,
num_epochs: int = 3)
Source code:

def train_loda_llm(programs_dir: str,
                   output_dir: str = "loda_llm_model",
                   model_name: str = "t5-small",
                   max_examples: int = -1,
                   val_split: float = 0.1,
                   batch_size: int = 8,
                   learning_rate: float = 5e-5,
                   num_epochs: int = 3):
    """
    Main training function.

    Args:
        programs_dir: Directory containing OEIS programs
        output_dir: Directory to save the trained model
        model_name: Base T5 model to use
        max_examples: Maximum number of training examples (-1 for all)
        val_split: Fraction of data to use for validation
        batch_size: Training batch size
        learning_rate: Learning rate
        num_epochs: Number of training epochs
    """
    print("Preparing training data...")

    # Create training examples
    preprocessor = DataPreprocessor(programs_dir)
    examples = preprocessor.create_training_examples(max_examples)

    if len(examples) == 0:
        print("No training examples found!")
        return None

    # Augment examples
    print("Augmenting training examples...")
    examples = preprocessor.augment_descriptions(examples)

    # Split into train/validation
    if val_split > 0:
        split_idx = int(len(examples) * (1 - val_split))
        train_examples = examples[:split_idx]
        val_examples = examples[split_idx:]
    else:
        train_examples = examples
        val_examples = None

    print(f"Training examples: {len(train_examples)}")
    if val_examples:
        print(f"Validation examples: {len(val_examples)}")

    # Create model
    print(f"Creating model based on {model_name}...")
    model = LodaT5Model(model_name)

    # Create datasets
    train_dataset = LodaDataset(train_examples, model)
    val_dataset = LodaDataset(val_examples, model) if val_examples else None

    # Create trainer
    trainer = LodaTrainer(
        model=model,
        train_dataset=train_dataset,
        val_dataset=val_dataset,
        learning_rate=learning_rate,
        batch_size=batch_size,
        num_epochs=num_epochs,
        save_dir=output_dir
    )

    # Train the model
    trained_model = trainer.train()

    # Save final model
    trained_model.save_model(output_dir)
    print(f"Final model saved to {output_dir}")

    return trained_model

Main training function.
Args
programs_dir: Directory containing OEIS programs
output_dir: Directory to save the trained model
model_name: Base T5 model to use
max_examples: Maximum number of training examples (-1 for all)
val_split: Fraction of data to use for validation
batch_size: Training batch size
learning_rate: Learning rate
num_epochs: Number of training epochs
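A hedged example of a small training run; the path and hyperparameters are illustrative only:
>>> from loda.llm import train_loda_llm
>>> model = train_loda_llm(
...     "programs/oeis",
...     output_dir="loda_llm_model",
...     max_examples=5000,
...     batch_size=4,
...     num_epochs=1)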
Classes
class DataPreprocessor (programs_dir: str)
Source code:

class DataPreprocessor:
    """Handles preprocessing of OEIS programs for LLM training."""

    def __init__(self, programs_dir: str):
        """Initialize with path to OEIS programs directory."""
        self.programs_dir = programs_dir
        self.program_cache = ProgramCache(programs_dir)

    def extract_description_from_program(self, program_text: str) -> Optional[str]:
        """
        Extract the natural language description from a LODA program.

        LODA programs typically start with comments like:
        ; A000045: Fibonacci numbers: F(n) = F(n-1) + F(n-2) with F(0) = 0 and F(1) = 1.

        Args:
            program_text: The full LODA program as text

        Returns:
            The description string or None if no description found
        """
        lines = program_text.strip().split('\n')

        for line in lines:
            # Look for OEIS description lines (start with ; A######:)
            match = re.match(r';\s*A\d{6}:\s*(.+)', line)
            if match:
                description = match.group(1).strip()
                # Clean up common artifacts
                description = description.rstrip('.')
                # Remove mathematical notation that might be confusing
                # Keep it simple for initial training
                return description

        return None

    def extract_terms_from_program(self, program_text: str) -> Optional[List[int]]:
        """
        Extract the sequence terms from a LODA program comment.

        Args:
            program_text: The full LODA program as text

        Returns:
            List of sequence terms or None if not found
        """
        lines = program_text.strip().split('\n')

        for line in lines:
            # Look for lines with comma-separated numbers (sequence terms)
            if line.startswith(';') and ',' in line:
                # Extract numbers from the line
                numbers_str = line[1:].strip()  # Remove the ';'

                # Skip if it looks like it contains non-numeric content
                if ':' in numbers_str or any(c.isalpha() for c in numbers_str):
                    continue

                try:
                    terms = [int(x.strip()) for x in numbers_str.split(',') if x.strip()]
                    if len(terms) >= 5:  # Reasonable number of terms
                        return terms
                except ValueError:
                    continue

        return None

    def clean_loda_code(self, program_text: str) -> str:
        """
        Clean LODA code by removing comments and normalizing format.

        Args:
            program_text: Raw LODA program text

        Returns:
            Cleaned LODA code suitable for training
        """
        lines = program_text.strip().split('\n')
        code_lines = []

        for line in lines:
            # Skip comment lines (lines that start with ;)
            if line.strip().startswith(';'):
                continue

            # Skip empty lines
            if not line.strip():
                continue

            # Remove inline comments (everything after ; on the same line)
            if ';' in line:
                code_part = line.split(';')[0].strip()
            else:
                code_part = line.strip()

            # Only add non-empty code lines
            if code_part:
                code_lines.append(code_part)

        return '\n'.join(code_lines)

    def create_training_examples(self, max_examples: int = -1) -> List[TrainingExample]:
        """
        Create training examples from all available LODA programs.

        Args:
            max_examples: Maximum number of examples to create (-1 for all)

        Returns:
            List of TrainingExample objects
        """
        examples = []
        program_ids = self.program_cache.all_ids()

        if max_examples > 0:
            program_ids = program_ids[:max_examples]

        print(f"Processing {len(program_ids)} programs...")

        for i, program_id in enumerate(program_ids):
            if i % 1000 == 0:
                print(f"Processed {i}/{len(program_ids)} programs")

            try:
                # Read the program file
                program_path = self.program_cache.path(program_id)
                if not os.path.exists(program_path):
                    continue

                with open(program_path, 'r') as f:
                    program_text = f.read()

                # Extract description
                description = self.extract_description_from_program(program_text)
                if not description:
                    continue

                # Extract terms (optional)
                terms = self.extract_terms_from_program(program_text)

                # Clean the LODA code
                clean_code = self.clean_loda_code(program_text)
                if not clean_code:
                    continue

                # Validate that the code parses correctly
                try:
                    Program(clean_code)
                except Exception:
                    continue  # Skip programs that don't parse

                example = TrainingExample(
                    sequence_id=str(program_id),
                    description=description,
                    loda_code=clean_code,
                    terms=terms
                )
                examples.append(example)

            except Exception as e:
                print(f"Error processing {program_id}: {e}")
                continue

        print(f"Created {len(examples)} training examples")
        return examples

    def augment_descriptions(self, examples: List[TrainingExample]) -> List[TrainingExample]:
        """
        Augment training examples with variations of descriptions.
        This can help make the model more robust to different phrasings.

        Args:
            examples: List of original training examples

        Returns:
            Augmented list with additional variations
        """
        augmented = list(examples)  # Start with originals

        for example in examples:
            desc = example.description

            # Create variations
            variations = []

            # Add "sequence of" prefix if not present
            if not desc.lower().startswith(('sequence', 'the sequence')):
                variations.append(f"Sequence of {desc.lower()}")

            # Add "Generate" prefix
            variations.append(f"Generate {desc.lower()}")

            # Add "Compute" prefix
            variations.append(f"Compute {desc.lower()}")

            # Remove mathematical symbols for simpler versions
            simple_desc = re.sub(r'[()=+\-*/^]', ' ', desc)
            simple_desc = re.sub(r'\s+', ' ', simple_desc).strip()
            if simple_desc != desc and simple_desc:
                variations.append(simple_desc)

            # Create new examples for each variation
            for variation in variations:
                augmented_example = TrainingExample(
                    sequence_id=str(example.sequence_id) + "_aug",
                    description=variation,
                    loda_code=example.loda_code,
                    terms=example.terms
                )
                augmented.append(augmented_example)

        return augmented

    def save_dataset(self, examples: List[TrainingExample], output_file: str):
        """
        Save training examples to a file for later use.

        Args:
            examples: List of training examples
            output_file: Path to output file
        """
        import json

        data = []
        for example in examples:
            data.append({
                'sequence_id': example.sequence_id,
                'description': example.description,
                'loda_code': example.loda_code,
                'terms': example.terms
            })

        with open(output_file, 'w') as f:
            json.dump(data, f, indent=2)

        print(f"Saved {len(examples)} examples to {output_file}")

    def load_dataset(self, input_file: str) -> List[TrainingExample]:
        """
        Load training examples from a file.

        Args:
            input_file: Path to input file

        Returns:
            List of TrainingExample objects
        """
        import json

        with open(input_file, 'r') as f:
            data = json.load(f)

        examples = []
        for item in data:
            example = TrainingExample(
                sequence_id=item['sequence_id'],
                description=item['description'],
                loda_code=item['loda_code'],
                terms=item.get('terms')
            )
            examples.append(example)

        print(f"Loaded {len(examples)} examples from {input_file}")
        return examples

Handles preprocessing of OEIS programs for LLM training.
Initialize with path to OEIS programs directory.
Methods
def augment_descriptions(self, examples: List[TrainingExample]) -> List[TrainingExample]
Augment training examples with variations of descriptions.
This can help make the model more robust to different phrasings.
Args
examples: List of original training examples
Returns
Augmented list with additional variations
def clean_loda_code(self, program_text: str) -> str
Clean LODA code by removing comments and normalizing format.
Args
program_text: Raw LODA program text
Returns
Cleaned LODA code suitable for training
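For illustration, a sketch of the cleaning step on an inline program; the programs directory passed to DataPreprocessor is a placeholder and is not used by this call:
>>> from loda.llm import DataPreprocessor
>>> pre = DataPreprocessor("programs/oeis")
>>> raw = "; A000045: Fibonacci numbers\n; 0,1,1,2,3,5,8\nmov $3,1\nlpb $0 ; loop\n  sub $0,1\nlpe"
>>> print(pre.clean_loda_code(raw))
mov $3,1
lpb $0
sub $0,1
lpe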
def create_training_examples(self, max_examples: int = -1) -> List[TrainingExample]
Create training examples from all available LODA programs.
Args
max_examples: Maximum number of examples to create (-1 for all)
Returns
List of TrainingExample objects
def extract_description_from_program(self, program_text: str) -> str | None
Extract the natural language description from a LODA program.
LODA programs typically start with comments like: ; A000045: Fibonacci numbers: F(n) = F(n-1) + F(n-2) with F(0) = 0 and F(1) = 1.
Args
program_text: The full LODA program as text
Returns
The description string or None if no description found
def extract_terms_from_program(self, program_text: str) -> List[int] | None
Extract the sequence terms from a LODA program comment.
Args
program_text: The full LODA program as text
Returns
List of sequence terms or None if not found
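A small sketch of both extraction helpers on an inline program header (the programs directory is again a placeholder):
>>> from loda.llm import DataPreprocessor
>>> pre = DataPreprocessor("programs/oeis")
>>> text = "; A000045: Fibonacci numbers.\n; 0,1,1,2,3,5,8,13\nmov $1,$0\n"
>>> pre.extract_description_from_program(text)
'Fibonacci numbers'
>>> pre.extract_terms_from_program(text)
[0, 1, 1, 2, 3, 5, 8, 13]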
def load_dataset(self, input_file: str) -> List[TrainingExample]
Load training examples from a file.
Args
input_file: Path to input file
Returns
List of TrainingExample objects
def save_dataset(self, examples: List[TrainingExample], output_file: str)
Save training examples to a file for later use.
Args
examples: List of training examples
output_file: Path to output file
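A save/load round-trip sketch (paths are placeholders):
>>> from loda.llm import DataPreprocessor
>>> pre = DataPreprocessor("programs/oeis")
>>> examples = pre.create_training_examples(max_examples=100)
>>> pre.save_dataset(examples, "loda_dataset.json")
>>> reloaded = pre.load_dataset("loda_dataset.json")
>>> len(reloaded) == len(examples)
True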
class GenerationResult (description: str,
generated_code: str,
is_valid: bool,
error_message: str | None = None,
generated_sequence: List[int] | None = None,
generation_time: float = 0.0)
Source code:

@dataclass
class GenerationResult:
    """Result of code generation."""
    description: str
    generated_code: str
    is_valid: bool
    error_message: Optional[str] = None
    generated_sequence: Optional[List[int]] = None
    generation_time: float = 0.0

Result of code generation.
Instance variables
var description : str
var error_message : str | None
var generated_code : str
var generated_sequence : List[int] | None
var generation_time : float
var is_valid : bool
class LodaEvaluator (model: LodaT5Model)
Source code:

class LodaEvaluator:
    """Evaluator for assessing model performance."""

    def __init__(self, model: LodaT5Model):
        """
        Initialize the evaluator.

        Args:
            model: Trained LodaT5Model to evaluate
        """
        self.model = model
        self.generator = LodaGenerator(model)

    def evaluate_examples(self, test_examples: List[TrainingExample]) -> Dict[str, float]:
        """
        Evaluate the model on test examples.

        Args:
            test_examples: List of test examples

        Returns:
            Dictionary with evaluation metrics
        """
        print(f"Evaluating on {len(test_examples)} examples...")

        total_examples = len(test_examples)
        valid_programs = 0
        exact_matches = 0
        sequence_matches = 0
        total_generation_time = 0

        results = []

        for i, example in enumerate(test_examples):
            if i % 10 == 0:
                print(f"Progress: {i}/{total_examples}")

            # Generate code
            generation_results = self.generator.generate(example.description, num_samples=1)

            if generation_results:
                result = generation_results[0]
                results.append(result)

                total_generation_time += result.generation_time

                if result.is_valid:
                    valid_programs += 1

                    # Check for exact match
                    if self._normalize_code(result.generated_code) == self._normalize_code(example.loda_code):
                        exact_matches += 1

                    # Check for sequence match (if we have expected terms)
                    if (example.terms and result.generated_sequence and
                            len(result.generated_sequence) >= 3 and
                            result.generated_sequence[:3] == example.terms[:3]):
                        sequence_matches += 1

        # Calculate metrics
        metrics = {
            'total_examples': total_examples,
            'valid_program_rate': valid_programs / total_examples if total_examples > 0 else 0,
            'exact_match_rate': exact_matches / total_examples if total_examples > 0 else 0,
            'sequence_match_rate': sequence_matches / total_examples if total_examples > 0 else 0,
            'avg_generation_time': total_generation_time / total_examples if total_examples > 0 else 0,
            'valid_programs': valid_programs,
            'exact_matches': exact_matches,
            'sequence_matches': sequence_matches
        }

        return metrics, results

    def _normalize_code(self, code: str) -> str:
        """Normalize code for comparison."""
        # Remove extra whitespace and normalize format
        lines = []
        for line in code.strip().split('\n'):
            line = line.strip()
            if line:
                lines.append(line)
        return '\n'.join(lines)

    def print_evaluation_report(self, metrics: Dict[str, float], results: List[GenerationResult]):
        """Print a detailed evaluation report."""
        print("\n" + "="*60)
        print("LODA LLM EVALUATION REPORT")
        print("="*60)

        print(f"Total Examples: {metrics['total_examples']}")
        print(f"Valid Programs: {metrics['valid_programs']} ({metrics['valid_program_rate']:.1%})")
        print(f"Exact Matches: {metrics['exact_matches']} ({metrics['exact_match_rate']:.1%})")
        print(f"Sequence Matches: {metrics['sequence_matches']} ({metrics['sequence_match_rate']:.1%})")
        print(f"Avg Generation Time: {metrics['avg_generation_time']:.2f}s")

        # Show some example results
        print("\n" + "-"*60)
        print("SAMPLE RESULTS")
        print("-"*60)

        # Show successful examples
        successful = [r for r in results if r.is_valid]
        if successful:
            print("\nSuccessful generations:")
            for i, result in enumerate(successful[:3]):  # Show first 3
                print(f"\n{i+1}. Description: {result.description}")
                print(f" Generated: {result.generated_code.replace(chr(10), '; ')}")
                if result.generated_sequence:
                    print(f" Sequence: {result.generated_sequence}")

        # Show failed examples
        failed = [r for r in results if not r.is_valid]
        if failed:
            print(f"\nFailed generations ({len(failed)} total):")
            for i, result in enumerate(failed[:3]):  # Show first 3
                print(f"\n{i+1}. Description: {result.description}")
                print(f" Error: {result.error_message}")
                print(f" Generated: {result.generated_code.replace(chr(10), '; ')}")

Evaluator for assessing model performance.
Initialize the evaluator.
Args
model: Trained LodaT5Model to evaluate
Methods
def evaluate_examples(self,
test_examples: List[TrainingExample]) -> Dict[str, float]
Evaluate the model on test examples.
Args
test_examples: List of test examples
Returns
A metrics dictionary together with the list of GenerationResult objects
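A usage sketch, assuming a trained model and a held-out list of TrainingExample objects; as in the source above, the call returns both the metrics dictionary and the per-example results:
>>> from loda.llm import LodaEvaluator
>>> evaluator = LodaEvaluator(model)
>>> metrics, results = evaluator.evaluate_examples(test_examples)
>>> evaluator.print_evaluation_report(metrics, results)
>>> print(metrics['valid_program_rate'])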
def print_evaluation_report(self,
metrics: Dict[str, float],
results: List[GenerationResult])
Print a detailed evaluation report.
class LodaGenerator (model: LodaT5Model,
max_length: int = 256,
num_beams: int = 4)
Source code:

class LodaGenerator:
    """Generator class for creating LODA code from natural language."""

    def __init__(self, model: LodaT5Model, max_length: int = 256, num_beams: int = 4):
        """
        Initialize the generator.

        Args:
            model: Trained LodaT5Model
            max_length: Maximum length of generated code
            num_beams: Number of beams for beam search
        """
        self.model = model
        self.max_length = max_length
        self.num_beams = num_beams

    def generate(self, description: str, num_samples: int = 1) -> List[GenerationResult]:
        """
        Generate LODA code from a natural language description.

        Args:
            description: Natural language description of the sequence
            num_samples: Number of code samples to generate

        Returns:
            List of GenerationResult objects
        """
        start_time = time.time()

        # Generate multiple samples
        descriptions = [description] * num_samples
        generated_codes = self.model.generate(
            descriptions,
            max_length=self.max_length,
            num_beams=self.num_beams
        )

        generation_time = time.time() - start_time

        results = []
        for code in generated_codes:
            result = self._validate_and_evaluate_code(description, code)
            result.generation_time = generation_time / num_samples
            results.append(result)

        return results

    def _validate_and_evaluate_code(self, description: str, code: str) -> GenerationResult:
        """
        Validate and evaluate generated LODA code.

        Args:
            description: Original description
            code: Generated LODA code

        Returns:
            GenerationResult with validation info
        """
        result = GenerationResult(
            description=description,
            generated_code=code,
            is_valid=False
        )

        try:
            # Try to parse the program
            program = Program(code)

            # Try to evaluate it for a few terms
            interpreter = Interpreter(max_memory=100, max_stack=10, max_steps=10000)
            evaluator = Evaluator(program, interpreter)

            sequence_terms = []
            for i in range(10):  # Generate first 10 terms
                try:
                    term = evaluator(i)
                    sequence_terms.append(term)
                except Exception:
                    break  # Stop if evaluation fails

            if len(sequence_terms) >= 3:  # At least 3 terms generated
                result.is_valid = True
                result.generated_sequence = sequence_terms
            else:
                result.error_message = "Could not generate sufficient sequence terms"

        except Exception as e:
            result.error_message = f"Program validation failed: {str(e)}"

        return result

    def generate_interactive(self):
        """Interactive mode for generating LODA code."""
        print("LODA Code Generator - Interactive Mode")
        print("Enter natural language descriptions to generate LODA code.")
        print("Type 'quit' to exit.\n")

        while True:
            try:
                description = input("Description: ").strip()

                if description.lower() in ['quit', 'exit', 'q']:
                    print("Goodbye!")
                    break

                if not description:
                    continue

                print("Generating code...")
                results = self.generate(description, num_samples=1)

                for i, result in enumerate(results):
                    print(f"\n--- Result {i+1} ---")
                    print(f"Generated in {result.generation_time:.2f}s")
                    print(f"Valid: {result.is_valid}")

                    if result.error_message:
                        print(f"Error: {result.error_message}")

                    print("Generated LODA code:")
                    print(result.generated_code)

                    if result.generated_sequence:
                        print(f"Sequence terms: {result.generated_sequence}")

                    print("-" * 50)

            except KeyboardInterrupt:
                print("\nGoodbye!")
                break
            except Exception as e:
                print(f"Error: {e}")

Generator class for creating LODA code from natural language.
Initialize the generator.
Args
model: Trained LodaT5Model
max_length: Maximum length of generated code
num_beams: Number of beams for beam search
Methods
def generate(self, description: str, num_samples: int = 1) -> List[GenerationResult]
Generate LODA code from a natural language description.
Args
description: Natural language description of the sequence
num_samples: Number of code samples to generate
Returns
List of GenerationResult objects
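A sketch of sampling several candidates and keeping the first valid one (assumes a trained LodaT5Model instance):
>>> from loda.llm import LodaGenerator
>>> generator = LodaGenerator(model, num_beams=4)
>>> results = generator.generate("Catalan numbers", num_samples=3)
>>> best = next((r for r in results if r.is_valid), None)
>>> if best is not None:
...     print(best.generated_code)
...     print(best.generated_sequence)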
def generate_interactive(self)
Interactive mode for generating LODA code.
class LodaT5Model (model_name: str = 't5-small', loda_vocab_size: int | None = None)
Source code:

class LodaT5Model(nn.Module):
    """
    T5-based model for natural language to LODA code generation.
    """

    def __init__(self, model_name: str = "t5-small", loda_vocab_size: Optional[int] = None):
        """
        Initialize the model.

        Args:
            model_name: Base T5 model to use
            loda_vocab_size: Size of LODA vocabulary (if extending tokenizer)
        """
        super().__init__()

        # Load base T5 model and tokenizer
        self.text_tokenizer = T5Tokenizer.from_pretrained(model_name)
        self.model = T5ForConditionalGeneration.from_pretrained(model_name)

        # Initialize LODA tokenizer
        self.loda_tokenizer = LodaTokenizer()

        # If we need to extend the vocabulary
        if loda_vocab_size and loda_vocab_size > self.loda_tokenizer.vocab_size:
            # Could extend vocabulary here if needed
            pass

    def prepare_input(self, descriptions: List[str]) -> Dict[str, torch.Tensor]:
        """
        Prepare natural language descriptions for input.

        Args:
            descriptions: List of natural language descriptions

        Returns:
            Dictionary with input tensors
        """
        # Add task prefix for T5
        prefixed_descriptions = [f"translate to loda: {desc}" for desc in descriptions]

        # Tokenize with T5 tokenizer
        encoded = self.text_tokenizer(
            prefixed_descriptions,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt"
        )

        return encoded

    def prepare_target(self, loda_codes: List[str]) -> Dict[str, torch.Tensor]:
        """
        Prepare LODA codes as targets.

        Args:
            loda_codes: List of LODA assembly codes

        Returns:
            Dictionary with target tensors
        """
        # For T5, we need to encode targets using the text tokenizer as well
        # We'll create a custom format that represents LODA code

        # Convert LODA to a text representation that T5 can understand
        text_loda_codes = []
        for code in loda_codes:
            # Convert LODA code to a more text-like format
            text_code = self.loda_to_text_format(code)
            text_loda_codes.append(text_code)

        encoded = self.text_tokenizer(
            text_loda_codes,
            padding=True,
            truncation=True,
            max_length=256,
            return_tensors="pt"
        )

        return encoded

    def loda_to_text_format(self, code: str) -> str:
        """
        Convert LODA code to a text format suitable for T5.
        This creates a more natural language representation of LODA code.

        Args:
            code: LODA assembly code

        Returns:
            Text representation of the code
        """
        lines = code.strip().split('\n')
        text_parts = []

        for line in lines:
            line = line.strip()
            if not line:
                continue

            # Parse the line and convert to text
            parts = line.replace(',', ' ').split()
            if len(parts) >= 3:
                op, target, source = parts[0], parts[1], parts[2]
                text_parts.append(f"{op} {target} {source}")
            elif len(parts) >= 2:
                op, target = parts[0], parts[1]
                text_parts.append(f"{op} {target}")
            else:
                text_parts.append(line)

        return " | ".join(text_parts)

    def text_format_to_loda(self, text_code: str) -> str:
        """
        Convert text format back to LODA code.

        Args:
            text_code: Text representation of LODA code

        Returns:
            LODA assembly code
        """
        parts = text_code.split(" | ")
        loda_lines = []

        for part in parts:
            part = part.strip()
            if not part:
                continue

            tokens = part.split()
            if len(tokens) >= 3:
                op, target, source = tokens[0], tokens[1], tokens[2]
                loda_lines.append(f"{op} {target},{source}")
            elif len(tokens) >= 2:
                op, target = tokens[0], tokens[1]
                loda_lines.append(f"{op} {target}")
            else:
                loda_lines.append(part)

        return '\n'.join(loda_lines)

    def forward(self, input_ids, attention_mask, labels=None):
        """
        Forward pass of the model.

        Args:
            input_ids: Input token IDs
            attention_mask: Attention mask
            labels: Target labels (for training)

        Returns:
            Model outputs
        """
        return self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

    def generate(self, descriptions: List[str], max_length: int = 256, num_beams: int = 4) -> List[str]:
        """
        Generate LODA code from natural language descriptions.

        Args:
            descriptions: List of natural language descriptions
            max_length: Maximum length of generated sequences
            num_beams: Number of beams for beam search

        Returns:
            List of generated LODA codes
        """
        # Prepare input
        inputs = self.prepare_input(descriptions)

        # Generate with the model
        with torch.no_grad():
            generated_ids = self.model.generate(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                max_length=max_length,
                num_beams=num_beams,
                early_stopping=True,
                do_sample=False
            )

        # Decode generated sequences
        generated_texts = self.text_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        # Convert from text format back to LODA
        loda_codes = [self.text_format_to_loda(text) for text in generated_texts]

        return loda_codes

    def save_model(self, save_path: str):
        """
        Save the model and tokenizers.

        Args:
            save_path: Directory to save the model
        """
        os.makedirs(save_path, exist_ok=True)

        # Save T5 model and tokenizer
        self.model.save_pretrained(save_path)
        self.text_tokenizer.save_pretrained(save_path)

        # Save LODA tokenizer
        loda_tokenizer_path = os.path.join(save_path, "loda_tokenizer.json")
        with open(loda_tokenizer_path, 'w') as f:
            json.dump({
                'vocab': self.loda_tokenizer.vocab,
                'reverse_vocab': {str(k): v for k, v in self.loda_tokenizer.reverse_vocab.items()}
            }, f, indent=2)

    @classmethod
    def load_model(cls, load_path: str):
        """
        Load a saved model.

        Args:
            load_path: Directory containing the saved model

        Returns:
            Loaded LodaT5Model instance
        """
        # Load T5 model and tokenizer
        model = T5ForConditionalGeneration.from_pretrained(load_path)
        text_tokenizer = T5Tokenizer.from_pretrained(load_path)

        # Create model instance
        loda_model = cls()
        loda_model.model = model
        loda_model.text_tokenizer = text_tokenizer

        # Load LODA tokenizer if it exists
        loda_tokenizer_path = os.path.join(load_path, "loda_tokenizer.json")
        if os.path.exists(loda_tokenizer_path):
            with open(loda_tokenizer_path, 'r') as f:
                tokenizer_data = json.load(f)
                loda_model.loda_tokenizer.vocab = tokenizer_data['vocab']
                loda_model.loda_tokenizer.reverse_vocab = {
                    int(k): v for k, v in tokenizer_data['reverse_vocab'].items()
                }

        return loda_model

T5-based model for natural language to LODA code generation.
Initialize the model.
Args
model_name: Base T5 model to use
loda_vocab_size: Size of LODA vocabulary (if extending tokenizer)
Ancestors
- torch.nn.modules.module.Module
Static methods
def load_model(load_path: str)
Load a saved model.
Args
load_path: Directory containing the saved model
Returns
Loaded LodaT5Model instance
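A loading sketch; the directory is a placeholder produced earlier by save_model:
>>> from loda.llm import LodaT5Model
>>> model = LodaT5Model.load_model("loda_llm_model")
>>> codes = model.generate(["Fibonacci numbers"], max_length=128, num_beams=4)
>>> print(codes[0])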
Methods
def forward(self, input_ids, attention_mask, labels=None)
Forward pass of the model.
Args
input_ids: Input token IDs
attention_mask: Attention mask
labels: Target labels (for training)
Returns
Model outputs
def generate(self, descriptions: List[str], max_length: int = 256, num_beams: int = 4) -> List[str]
Generate LODA code from natural language descriptions.
Args
descriptions: List of natural language descriptions
max_length: Maximum length of generated sequences
num_beams: Number of beams for beam search
Returns
List of generated LODA codes
def loda_to_text_format(self, code: str) -> str
Convert LODA code to a text format suitable for T5.
This creates a more natural language representation of LODA code.
Args
code: LODA assembly code
Returns
Text representation of the code
def prepare_input(self, descriptions: List[str]) -> Dict[str, torch.Tensor]
Prepare natural language descriptions for input.
Args
descriptions: List of natural language descriptions
Returns
Dictionary with input tensors
def prepare_target(self, loda_codes: List[str]) -> Dict[str, torch.Tensor]
Prepare LODA codes as targets.
Args
loda_codes: List of LODA assembly codes
Returns
Dictionary with target tensors
def save_model(self, save_path: str)
Save the model and tokenizers.
Args
save_path: Directory to save the model
def text_format_to_loda(self, text_code: str) -> str
Convert text format back to LODA code.
Args
text_code: Text representation of LODA code
Returns
LODA assembly code
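A round-trip sketch of the intermediate text format (constructing LodaT5Model downloads the t5-small checkpoint on first use):
>>> from loda.llm import LodaT5Model
>>> model = LodaT5Model("t5-small")
>>> text = model.loda_to_text_format("mov $1,$0\nadd $1,2")
>>> text
'mov $1 $0 | add $1 2'
>>> model.text_format_to_loda(text)
'mov $1,$0\nadd $1,2'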
class LodaTokenizer
Source code:

class LodaTokenizer:
    """Custom tokenizer for LODA assembly language."""

    def __init__(self):
        """Initialize LODA tokenizer with vocabulary."""
        # LODA operations
        self.operations = [
            'mov', 'add', 'sub', 'mul', 'div', 'dif', 'mod', 'pow', 'gcd', 'bin',
            'cmp', 'min', 'max', 'lpb', 'lpe', 'nop', 'cal', 'seq', 'trn', 'clr'
        ]

        # Common operand patterns
        self.operand_patterns = [
            # Direct memory references
            '$0', '$1', '$2', '$3', '$4', '$5', '$6', '$7', '$8', '$9', '$10',
            # Indirect memory references
            '$$1', '$$2', '$$3', '$$4', '$$5',
            # Common constants
            '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '-1'
        ]

        # Special tokens
        self.special_tokens = ['<pad>', '<unk>', '<s>', '</s>', '<mask>']

        # Build vocabulary
        self.vocab = {}
        self.reverse_vocab = {}

        # Add special tokens first
        for i, token in enumerate(self.special_tokens):
            self.vocab[token] = i
            self.reverse_vocab[i] = token

        # Add operations
        for token in self.operations:
            idx = len(self.vocab)
            self.vocab[token] = idx
            self.reverse_vocab[idx] = token

        # Add operand patterns
        for token in self.operand_patterns:
            idx = len(self.vocab)
            self.vocab[token] = idx
            self.reverse_vocab[idx] = token

        self.vocab_size = len(self.vocab)
        self.pad_token_id = self.vocab['<pad>']
        self.unk_token_id = self.vocab['<unk>']
        self.bos_token_id = self.vocab['<s>']
        self.eos_token_id = self.vocab['</s>']

    def tokenize_loda_code(self, code: str) -> List[str]:
        """
        Tokenize LODA assembly code.

        Args:
            code: LODA assembly code as string

        Returns:
            List of tokens
        """
        lines = code.strip().split('\n')
        tokens = ['<s>']  # Start token

        for line in lines:
            line = line.strip()
            if not line:
                continue

            # Split on whitespace and comma
            parts = line.replace(',', ' ').split()

            for part in parts:
                part = part.strip()
                if part in self.vocab:
                    tokens.append(part)
                else:
                    # Try to handle unknown operands
                    if part.startswith('$') and part[1:].isdigit():
                        # Direct memory reference
                        if part in self.vocab:
                            tokens.append(part)
                        else:
                            tokens.append('<unk>')
                    elif part.startswith('$$') and part[2:].isdigit():
                        # Indirect memory reference
                        if part in self.vocab:
                            tokens.append(part)
                        else:
                            tokens.append('<unk>')
                    elif part.lstrip('-').isdigit():
                        # Numeric constant
                        if part in self.vocab:
                            tokens.append(part)
                        else:
                            tokens.append('<unk>')
                    else:
                        tokens.append('<unk>')

        tokens.append('</s>')  # End token
        return tokens

    def encode_loda_code(self, code: str) -> List[int]:
        """
        Encode LODA code to token IDs.

        Args:
            code: LODA assembly code

        Returns:
            List of token IDs
        """
        tokens = self.tokenize_loda_code(code)
        return [self.vocab.get(token, self.unk_token_id) for token in tokens]

    def decode_loda_code(self, token_ids: List[int]) -> str:
        """
        Decode token IDs back to LODA code.

        Args:
            token_ids: List of token IDs

        Returns:
            LODA assembly code as string
        """
        tokens = [self.reverse_vocab.get(id, '<unk>') for id in token_ids]

        # Filter out special tokens
        filtered_tokens = []
        for token in tokens:
            if token in ['<s>', '</s>', '<pad>']:
                continue
            if token == '<unk>':
                continue
            filtered_tokens.append(token)

        # Reconstruct LODA code
        code_lines = []
        i = 0
        while i < len(filtered_tokens):
            if i + 2 < len(filtered_tokens):
                # Try to form operation: op target source
                op = filtered_tokens[i]
                if op in self.operations and i + 2 < len(filtered_tokens):
                    target = filtered_tokens[i + 1]
                    source = filtered_tokens[i + 2]
                    code_lines.append(f"{op} {target},{source}")
                    i += 3
                elif op in self.operations and i + 1 < len(filtered_tokens):
                    # Single operand operation
                    target = filtered_tokens[i + 1]
                    code_lines.append(f"{op} {target}")
                    i += 2
                else:
                    i += 1
            else:
                i += 1

        return '\n'.join(code_lines)

Custom tokenizer for LODA assembly language.
Initialize LODA tokenizer with vocabulary.
Methods
def decode_loda_code(self, token_ids: List[int]) -> str
Decode token IDs back to LODA code.
Args
token_ids: List of token IDs
Returns
LODA assembly code as string
def encode_loda_code(self, code: str) -> List[int]
Encode LODA code to token IDs.
Args
code: LODA assembly code
Returns
List of token IDs
def tokenize_loda_code(self, code: str) -> List[str]
Tokenize LODA assembly code.
Args
code: LODA assembly code as string
Returns
List of tokens
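A self-contained round-trip sketch; LodaTokenizer needs no pretrained weights:
>>> from loda.llm import LodaTokenizer
>>> tok = LodaTokenizer()
>>> tok.tokenize_loda_code("mov $1,$0\nadd $1,2")
['<s>', 'mov', '$1', '$0', 'add', '$1', '2', '</s>']
>>> ids = tok.encode_loda_code("mov $1,$0\nadd $1,2")
>>> tok.decode_loda_code(ids)
'mov $1,$0\nadd $1,2'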
class LodaTrainer (model: LodaT5Model,
train_dataset: LodaDataset,
val_dataset: LodaDataset | None = None,
learning_rate: float = 5e-05,
batch_size: int = 8,
num_epochs: int = 3,
warmup_steps: int = 500,
save_dir: str = 'loda_llm_model')
Source code:

class LodaTrainer:
    """Trainer class for LODA LLM."""

    def __init__(self,
                 model: LodaT5Model,
                 train_dataset: LodaDataset,
                 val_dataset: Optional[LodaDataset] = None,
                 learning_rate: float = 5e-5,
                 batch_size: int = 8,
                 num_epochs: int = 3,
                 warmup_steps: int = 500,
                 save_dir: str = "loda_llm_model"):
        """
        Initialize the trainer.

        Args:
            model: LodaT5Model to train
            train_dataset: Training dataset
            val_dataset: Validation dataset (optional)
            learning_rate: Learning rate
            batch_size: Batch size
            num_epochs: Number of training epochs
            warmup_steps: Number of warmup steps for learning rate schedule
            save_dir: Directory to save the model
        """
        self.model = model
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.num_epochs = num_epochs
        self.warmup_steps = warmup_steps
        self.save_dir = save_dir

        # Set up device
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.model.to(self.device)

        # Set up data loaders
        self.train_loader = DataLoader(
            train_dataset,
            batch_size=batch_size,
            shuffle=True,
            collate_fn=self._collate_fn
        )

        if val_dataset:
            self.val_loader = DataLoader(
                val_dataset,
                batch_size=batch_size,
                shuffle=False,
                collate_fn=self._collate_fn
            )

        # Set up optimizer
        self.optimizer = AdamW(
            self.model.model.parameters(),
            lr=learning_rate,
            weight_decay=0.01
        )

        # Set up learning rate scheduler
        total_steps = len(self.train_loader) * num_epochs
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=warmup_steps,
            num_training_steps=total_steps
        )

    def _collate_fn(self, batch):
        """Collate function for DataLoader."""
        # Pad sequences to the same length
        input_ids = [item['input_ids'] for item in batch]
        attention_masks = [item['attention_mask'] for item in batch]
        labels = [item['labels'] for item in batch]
        decoder_attention_masks = [item['decoder_attention_mask'] for item in batch]

        # Pad input sequences
        max_input_len = max(len(seq) for seq in input_ids)
        padded_input_ids = []
        padded_attention_masks = []

        for i in range(len(input_ids)):
            seq_len = len(input_ids[i])
            pad_len = max_input_len - seq_len

            padded_input_ids.append(
                torch.cat([input_ids[i], torch.zeros(pad_len, dtype=torch.long)])
            )
            padded_attention_masks.append(
                torch.cat([attention_masks[i], torch.zeros(pad_len, dtype=torch.long)])
            )

        # Pad target sequences
        max_target_len = max(len(seq) for seq in labels)
        padded_labels = []
        padded_decoder_masks = []

        for i in range(len(labels)):
            seq_len = len(labels[i])
            pad_len = max_target_len - seq_len

            # For labels, use -100 for padding (ignored in loss calculation)
            padded_labels.append(
                torch.cat([labels[i], torch.full((pad_len,), -100, dtype=torch.long)])
            )
            padded_decoder_masks.append(
                torch.cat([decoder_attention_masks[i], torch.zeros(pad_len, dtype=torch.long)])
            )

        return {
            'input_ids': torch.stack(padded_input_ids),
            'attention_mask': torch.stack(padded_attention_masks),
            'labels': torch.stack(padded_labels),
            'decoder_attention_mask': torch.stack(padded_decoder_masks)
        }

    def train_epoch(self):
        """Train for one epoch."""
        self.model.model.train()
        total_loss = 0

        progress_bar = tqdm(self.train_loader, desc="Training")

        for batch in progress_bar:
            # Move to device
            batch = {k: v.to(self.device) for k, v in batch.items()}

            # Forward pass
            outputs = self.model.forward(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                labels=batch['labels']
            )

            loss = outputs.loss
            total_loss += loss.item()

            # Backward pass
            loss.backward()

            # Clip gradients
            torch.nn.utils.clip_grad_norm_(self.model.model.parameters(), 1.0)

            # Update parameters
            self.optimizer.step()
            self.scheduler.step()
            self.optimizer.zero_grad()

            # Update progress bar
            progress_bar.set_postfix({'loss': loss.item()})

        return total_loss / len(self.train_loader)

    def validate(self):
        """Validate the model."""
        if not self.val_dataset:
            return None

        self.model.model.eval()
        total_loss = 0

        with torch.no_grad():
            progress_bar = tqdm(self.val_loader, desc="Validation")

            for batch in progress_bar:
                # Move to device
                batch = {k: v.to(self.device) for k, v in batch.items()}

                # Forward pass
                outputs = self.model.forward(
                    input_ids=batch['input_ids'],
                    attention_mask=batch['attention_mask'],
                    labels=batch['labels']
                )

                loss = outputs.loss
                total_loss += loss.item()

                progress_bar.set_postfix({'val_loss': loss.item()})

        return total_loss / len(self.val_loader)

    def train(self):
        """Train the model."""
        print(f"Training on device: {self.device}")
        print(f"Training examples: {len(self.train_dataset)}")
        if self.val_dataset:
            print(f"Validation examples: {len(self.val_dataset)}")

        best_val_loss = float('inf')

        for epoch in range(self.num_epochs):
            print(f"\nEpoch {epoch + 1}/{self.num_epochs}")

            # Train
            train_loss = self.train_epoch()
            print(f"Training loss: {train_loss:.4f}")

            # Validate
            val_loss = self.validate()
            if val_loss is not None:
                print(f"Validation loss: {val_loss:.4f}")

                # Save best model
                if val_loss < best_val_loss:
                    best_val_loss = val_loss
                    self.save_model(f"{self.save_dir}_best")
                    print("Saved best model")

            # Save checkpoint
            self.save_model(f"{self.save_dir}_epoch_{epoch + 1}")

        print("\nTraining completed!")
        return self.model

    def save_model(self, path: str):
        """Save the model."""
        self.model.save_model(path)

Trainer class for LODA LLM.
Initialize the trainer.
Args
model: LodaT5Model to train
train_dataset: Training dataset
val_dataset: Validation dataset (optional)
learning_rate: Learning rate
batch_size: Batch size
num_epochs: Number of training epochs
warmup_steps: Number of warmup steps for learning rate schedule
save_dir: Directory to save the model
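A manual setup sketch mirroring train_loda_llm; LodaDataset is assumed to be importable from loda.llm.trainer (as used in that function), and the data path is a placeholder:
>>> from loda.llm import DataPreprocessor, LodaT5Model
>>> from loda.llm.trainer import LodaDataset, LodaTrainer
>>> pre = DataPreprocessor("programs/oeis")
>>> examples = pre.augment_descriptions(pre.create_training_examples(max_examples=1000))
>>> model = LodaT5Model("t5-small")
>>> trainer = LodaTrainer(model=model, train_dataset=LodaDataset(examples, model),
...                       batch_size=4, num_epochs=1, save_dir="loda_llm_model")
>>> trained = trainer.train()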
Methods
def save_model(self, path: str)
Save the model.
def train(self)
Train the model.
def train_epoch(self)
Train for one epoch.
def validate(self)
Validate the model.
class TrainingExample (sequence_id: str,
description: str,
loda_code: str,
terms: List[int] | None = None)
Source code:

@dataclass
class TrainingExample:
    """A single training example pairing natural language with LODA code."""
    sequence_id: str
    description: str
    loda_code: str
    terms: Optional[List[int]] = None

A single training example pairing natural language with LODA code.
Instance variables
var description : str
var loda_code : str
var sequence_id : str
var terms : List[int] | None
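A construction sketch with illustrative values:
>>> from loda.llm import TrainingExample
>>> example = TrainingExample(
...     sequence_id="A001477",
...     description="The nonnegative integers",
...     loda_code="mov $1,$0",
...     terms=[0, 1, 2, 3, 4, 5])
>>> example.terms[:3]
[0, 1, 2]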