Module loda.llm.data_preprocessing

Data preprocessing utilities for LLM training on OEIS sequences and LODA programs.

This module handles:

1. Extracting sequence descriptions from LODA program comments
2. Pairing natural language descriptions with LODA code
3. Creating training datasets for sequence-to-sequence models
4. Tokenization and data formatting for transformer models

Functions

def create_dataset(programs_dir: str, output_file: str, max_examples: int = -1, augment: bool = True)
def create_dataset(programs_dir: str, output_file: str, max_examples: int = -1, augment: bool = True):
    """
    Convenience function to create and save a training dataset.
    
    Args:
        programs_dir: Path to OEIS programs directory
        output_file: Path to save the dataset
        max_examples: Maximum number of examples (-1 for all)
        augment: Whether to augment with description variations
    """
    preprocessor = DataPreprocessor(programs_dir)
    examples = preprocessor.create_training_examples(max_examples)
    
    if augment:
        examples = preprocessor.augment_descriptions(examples)
    
    preprocessor.save_dataset(examples, output_file)
    return examples

Convenience function to create and save a training dataset.

Args

programs_dir
Path to OEIS programs directory
output_file
Path to save the dataset
max_examples
Maximum number of examples (-1 for all)
augment
Whether to augment with description variations
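
A minimal usage sketch; the paths below are placeholders for a local LODA/OEIS programs checkout and an output location:

from loda.llm.data_preprocessing import create_dataset

# Placeholder paths; point these at your local programs directory
# and desired output file.
examples = create_dataset(
    programs_dir="programs/oeis",
    output_file="training_data.json",
    max_examples=1000,   # cap on programs scanned; -1 processes all
    augment=True,        # also emit rephrased description variations
)
print(f"Got {len(examples)} examples (including augmented variations)")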

Classes

class DataPreprocessor (programs_dir: str)
class DataPreprocessor:
    """Handles preprocessing of OEIS programs for LLM training."""
    
    def __init__(self, programs_dir: str):
        """Initialize with path to OEIS programs directory."""
        self.programs_dir = programs_dir
        self.program_cache = ProgramCache(programs_dir)
        
    def extract_description_from_program(self, program_text: str) -> Optional[str]:
        """
        Extract the natural language description from a LODA program.
        
        LODA programs typically start with comments like:
        ; A000045: Fibonacci numbers: F(n) = F(n-1) + F(n-2) with F(0) = 0 and F(1) = 1.
        
        Args:
            program_text: The full LODA program as text
            
        Returns:
            The description string or None if no description found
        """
        lines = program_text.strip().split('\n')
        
        for line in lines:
            # Look for OEIS description lines (start with ; A######:)
            match = re.match(r';\s*A\d{6}:\s*(.+)', line)
            if match:
                description = match.group(1).strip()
                # Clean up common artifacts: strip any trailing periods
                description = description.rstrip('.')
                return description
                
        return None
    
    def extract_terms_from_program(self, program_text: str) -> Optional[List[int]]:
        """
        Extract the sequence terms from a LODA program comment.
        
        Args:
            program_text: The full LODA program as text
            
        Returns:
            List of sequence terms or None if not found
        """
        lines = program_text.strip().split('\n')
        
        for line in lines:
            # Look for lines with comma-separated numbers (sequence terms)
            if line.startswith(';') and ',' in line:
                # Extract numbers from the line
                numbers_str = line[1:].strip()  # Remove the ';'
                # Skip if it looks like it contains non-numeric content
                if ':' in numbers_str or any(c.isalpha() for c in numbers_str):
                    continue
                    
                try:
                    terms = [int(x.strip()) for x in numbers_str.split(',') if x.strip()]
                    if len(terms) >= 5:  # Reasonable number of terms
                        return terms
                except ValueError:
                    continue
                    
        return None
    
    def clean_loda_code(self, program_text: str) -> str:
        """
        Clean LODA code by removing comments and normalizing format.
        
        Args:
            program_text: Raw LODA program text
            
        Returns:
            Cleaned LODA code suitable for training
        """
        lines = program_text.strip().split('\n')
        code_lines = []
        
        for line in lines:
            # Skip comment lines (lines that start with ;)
            if line.strip().startswith(';'):
                continue
            # Skip empty lines
            if not line.strip():
                continue
            
            # Remove inline comments (everything after ; on the same line)
            if ';' in line:
                code_part = line.split(';')[0].strip()
            else:
                code_part = line.strip()
            
            # Only add non-empty code lines
            if code_part:
                code_lines.append(code_part)
        
        return '\n'.join(code_lines)
    
    def create_training_examples(self, max_examples: int = -1) -> List[TrainingExample]:
        """
        Create training examples from all available LODA programs.
        
        Args:
            max_examples: Maximum number of examples to create (-1 for all)
            
        Returns:
            List of TrainingExample objects
        """
        examples = []
        program_ids = self.program_cache.all_ids()
        
        if max_examples > 0:
            program_ids = program_ids[:max_examples]
        
        print(f"Processing {len(program_ids)} programs...")
        
        for i, program_id in enumerate(program_ids):
            if i % 1000 == 0:
                print(f"Processed {i}/{len(program_ids)} programs")
                
            try:
                # Read the program file
                program_path = self.program_cache.path(program_id)
                if not os.path.exists(program_path):
                    continue
                    
                with open(program_path, 'r') as f:
                    program_text = f.read()
                
                # Extract description
                description = self.extract_description_from_program(program_text)
                if not description:
                    continue
                
                # Extract terms (optional)
                terms = self.extract_terms_from_program(program_text)
                
                # Clean the LODA code
                clean_code = self.clean_loda_code(program_text)
                if not clean_code:
                    continue
                
                # Validate that the code parses correctly
                try:
                    Program(clean_code)
                except Exception:
                    continue  # Skip programs that don't parse
                
                example = TrainingExample(
                    sequence_id=str(program_id),
                    description=description,
                    loda_code=clean_code,
                    terms=terms
                )
                examples.append(example)
                
            except Exception as e:
                print(f"Error processing {program_id}: {e}")
                continue
        
        print(f"Created {len(examples)} training examples")
        return examples
    
    def augment_descriptions(self, examples: List[TrainingExample]) -> List[TrainingExample]:
        """
        Augment training examples with variations of descriptions.
        
        This can help make the model more robust to different phrasings.
        
        Args:
            examples: List of original training examples
            
        Returns:
            Augmented list with additional variations
        """
        augmented = list(examples)  # Start with originals
        
        for example in examples:
            desc = example.description
            
            # Create variations
            variations = []
            
            # Add "sequence of" prefix if not present
            if not desc.lower().startswith(('sequence', 'the sequence')):
                variations.append(f"Sequence of {desc.lower()}")
            
            # Add "Generate" prefix
            variations.append(f"Generate {desc.lower()}")
            
            # Add "Compute" prefix
            variations.append(f"Compute {desc.lower()}")
            
            # Remove mathematical symbols for simpler versions
            simple_desc = re.sub(r'[()=+\-*/^]', ' ', desc)
            simple_desc = re.sub(r'\s+', ' ', simple_desc).strip()
            if simple_desc != desc and simple_desc:
                variations.append(simple_desc)
            
            # Create new examples for each variation
            for variation in variations:
                augmented_example = TrainingExample(
                    sequence_id=str(example.sequence_id) + "_aug",
                    description=variation,
                    loda_code=example.loda_code,
                    terms=example.terms
                )
                augmented.append(augmented_example)
        
        return augmented
    
    def save_dataset(self, examples: List[TrainingExample], output_file: str):
        """
        Save training examples to a file for later use.
        
        Args:
            examples: List of training examples
            output_file: Path to output file
        """
        import json
        
        data = []
        for example in examples:
            data.append({
                'sequence_id': example.sequence_id,
                'description': example.description,
                'loda_code': example.loda_code,
                'terms': example.terms
            })
        
        with open(output_file, 'w') as f:
            json.dump(data, f, indent=2)
        
        print(f"Saved {len(examples)} examples to {output_file}")
    
    def load_dataset(self, input_file: str) -> List[TrainingExample]:
        """
        Load training examples from a file.
        
        Args:
            input_file: Path to input file
            
        Returns:
            List of TrainingExample objects
        """
        import json
        
        with open(input_file, 'r') as f:
            data = json.load(f)
        
        examples = []
        for item in data:
            example = TrainingExample(
                sequence_id=item['sequence_id'],
                description=item['description'],
                loda_code=item['loda_code'],
                terms=item.get('terms')
            )
            examples.append(example)
        
        print(f"Loaded {len(examples)} examples from {input_file}")
        return examples

Handles preprocessing of OEIS programs for LLM training.

Initialize with path to OEIS programs directory.
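
The methods below compose into a pipeline. A typical run, assuming a local programs checkout at the placeholder path "programs/oeis", might look like:

from loda.llm.data_preprocessing import DataPreprocessor

pre = DataPreprocessor("programs/oeis")   # placeholder path
examples = pre.create_training_examples(max_examples=100)
examples = pre.augment_descriptions(examples)
pre.save_dataset(examples, "training_data.json")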

Methods

def augment_descriptions(self,
examples: List[TrainingExample]) ‑> List[TrainingExample]
def augment_descriptions(self, examples: List[TrainingExample]) -> List[TrainingExample]:
    """
    Augment training examples with variations of descriptions.
    
    This can help make the model more robust to different phrasings.
    
    Args:
        examples: List of original training examples
        
    Returns:
        Augmented list with additional variations
    """
    augmented = list(examples)  # Start with originals
    
    for example in examples:
        desc = example.description
        
        # Create variations
        variations = []
        
        # Add "sequence of" prefix if not present
        if not desc.lower().startswith(('sequence', 'the sequence')):
            variations.append(f"Sequence of {desc.lower()}")
        
        # Add "Generate" prefix
        variations.append(f"Generate {desc.lower()}")
        
        # Add "Compute" prefix
        variations.append(f"Compute {desc.lower()}")
        
        # Remove mathematical symbols for simpler versions
        simple_desc = re.sub(r'[()=+\-*/^]', ' ', desc)
        simple_desc = re.sub(r'\s+', ' ', simple_desc).strip()
        if simple_desc != desc and simple_desc:
            variations.append(simple_desc)
        
        # Create new examples for each variation
        for variation in variations:
            augmented_example = TrainingExample(
                sequence_id=str(example.sequence_id) + "_aug",
                description=variation,
                loda_code=example.loda_code,
                terms=example.terms
            )
            augmented.append(augmented_example)
    
    return augmented

Augment training examples with variations of descriptions.

This can help make the model more robust to different phrasings.

Args

examples
List of original training examples

Returns

Augmented list with additional variations
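
For example, a single example whose description is "Fibonacci numbers" yields the original plus three prefixed variations; the symbol-stripped variant is skipped here because the description contains no mathematical symbols. This sketch assumes `preprocessor` is a DataPreprocessor instance constructed as shown earlier:

from loda.llm.data_preprocessing import TrainingExample

ex = TrainingExample(sequence_id="A000045",
                     description="Fibonacci numbers",
                     loda_code="mov $1,$0")   # placeholder program body
out = preprocessor.augment_descriptions([ex])
# len(out) == 4; the three added examples have descriptions:
#   'Sequence of fibonacci numbers'
#   'Generate fibonacci numbers'
#   'Compute fibonacci numbers'
# and all share the sequence_id 'A000045_aug'.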

def clean_loda_code(self, program_text: str) ‑> str
def clean_loda_code(self, program_text: str) -> str:
    """
    Clean LODA code by removing comments and normalizing format.
    
    Args:
        program_text: Raw LODA program text
        
    Returns:
        Cleaned LODA code suitable for training
    """
    lines = program_text.strip().split('\n')
    code_lines = []
    
    for line in lines:
        # Skip comment lines (lines that start with ;)
        if line.strip().startswith(';'):
            continue
        # Skip empty lines
        if not line.strip():
            continue
        
        # Remove inline comments (everything after ; on the same line)
        if ';' in line:
            code_part = line.split(';')[0].strip()
        else:
            code_part = line.strip()
        
        # Only add non-empty code lines
        if code_part:
            code_lines.append(code_part)
    
    return '\n'.join(code_lines)

Clean LODA code by removing comments and normalizing format.

Args

program_text
Raw LODA program text

Returns

Cleaned LODA code suitable for training
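
For instance, with a placeholder program body and `preprocessor` an instance as above (note that inline comments and leading indentation are both stripped):

raw = """; A000045: Fibonacci numbers.
mov $3,1 ; seed value
lpb $0
  sub $0,1
lpe"""
print(preprocessor.clean_loda_code(raw))
# mov $3,1
# lpb $0
# sub $0,1
# lpe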

def create_training_examples(self, max_examples: int = -1) ‑> List[TrainingExample]
def create_training_examples(self, max_examples: int = -1) -> List[TrainingExample]:
    """
    Create training examples from all available LODA programs.
    
    Args:
        max_examples: Maximum number of examples to create (-1 for all)
        
    Returns:
        List of TrainingExample objects
    """
    examples = []
    program_ids = self.program_cache.all_ids()
    
    if max_examples > 0:
        program_ids = program_ids[:max_examples]
    
    print(f"Processing {len(program_ids)} programs...")
    
    for i, program_id in enumerate(program_ids):
        if i % 1000 == 0:
            print(f"Processed {i}/{len(program_ids)} programs")
            
        try:
            # Read the program file
            program_path = self.program_cache.path(program_id)
            if not os.path.exists(program_path):
                continue
                
            with open(program_path, 'r') as f:
                program_text = f.read()
            
            # Extract description
            description = self.extract_description_from_program(program_text)
            if not description:
                continue
            
            # Extract terms (optional)
            terms = self.extract_terms_from_program(program_text)
            
            # Clean the LODA code
            clean_code = self.clean_loda_code(program_text)
            if not clean_code:
                continue
            
            # Validate that the code parses correctly
            try:
                Program(clean_code)
            except Exception:
                continue  # Skip programs that don't parse
            
            example = TrainingExample(
                sequence_id=str(program_id),
                description=description,
                loda_code=clean_code,
                terms=terms
            )
            examples.append(example)
            
        except Exception as e:
            print(f"Error processing {program_id}: {e}")
            continue
    
    print(f"Created {len(examples)} training examples")
    return examples

Create training examples from all available LODA programs.

Args

max_examples
Maximum number of examples to create (-1 for all)

Returns

List of TrainingExample objects
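
Note that max_examples truncates the list of program ids before any filtering, so the result may contain fewer examples than requested. A usage sketch with a placeholder path:

pre = DataPreprocessor("programs/oeis")   # placeholder path
examples = pre.create_training_examples(max_examples=500)
# len(examples) <= 500: programs without a description, with empty
# code, or that fail to parse are skipped.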

def extract_description_from_program(self, program_text: str) ‑> str | None
def extract_description_from_program(self, program_text: str) -> Optional[str]:
    """
    Extract the natural language description from a LODA program.
    
    LODA programs typically start with comments like:
    ; A000045: Fibonacci numbers: F(n) = F(n-1) + F(n-2) with F(0) = 0 and F(1) = 1.
    
    Args:
        program_text: The full LODA program as text
        
    Returns:
        The description string or None if no description found
    """
    lines = program_text.strip().split('\n')
    
    for line in lines:
        # Look for OEIS description lines (start with ; A######:)
        match = re.match(r';\s*A\d{6}:\s*(.+)', line)
        if match:
            description = match.group(1).strip()
            # Clean up common artifacts: strip any trailing periods
            description = description.rstrip('.')
            return description
            
    return None

Extract the natural language description from a LODA program.

LODA programs typically start with comments like:

; A000045: Fibonacci numbers: F(n) = F(n-1) + F(n-2) with F(0) = 0 and F(1) = 1.

Args

program_text
The full LODA program as text

Returns

The description string or None if no description found
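
Given the comment format above, the method returns the text after the ID with trailing periods stripped. This sketch assumes `preprocessor` is a DataPreprocessor instance constructed as shown earlier:

text = ("; A000045: Fibonacci numbers: F(n) = F(n-1) + F(n-2) "
        "with F(0) = 0 and F(1) = 1.\nmov $1,$0")
preprocessor.extract_description_from_program(text)
# -> 'Fibonacci numbers: F(n) = F(n-1) + F(n-2) with F(0) = 0 and F(1) = 1'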

def extract_terms_from_program(self, program_text: str) ‑> List[int] | None
def extract_terms_from_program(self, program_text: str) -> Optional[List[int]]:
    """
    Extract the sequence terms from a LODA program comment.
    
    Args:
        program_text: The full LODA program as text
        
    Returns:
        List of sequence terms or None if not found
    """
    lines = program_text.strip().split('\n')
    
    for line in lines:
        # Look for lines with comma-separated numbers (sequence terms)
        if line.startswith(';') and ',' in line:
            # Extract numbers from the line
            numbers_str = line[1:].strip()  # Remove the ';'
            # Skip if it looks like it contains non-numeric content
            if ':' in numbers_str or any(c.isalpha() for c in numbers_str):
                continue
                
            try:
                terms = [int(x.strip()) for x in numbers_str.split(',') if x.strip()]
                if len(terms) >= 5:  # Reasonable number of terms
                    return terms
            except ValueError:
                continue
                
    return None

Extract the sequence terms from a LODA program comment.

Args

program_text
The full LODA program as text

Returns

List of sequence terms or None if not found
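
A comment line consisting only of comma-separated integers is parsed; lines containing letters or colons are skipped. Again assuming `preprocessor` is an instance as above:

text = "; A000045: Fibonacci numbers.\n; 0,1,1,2,3,5,8,13,21,34\nmov $1,$0"
preprocessor.extract_terms_from_program(text)
# -> [0, 1, 1, 2, 3, 5, 8, 13, 21, 34]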

def load_dataset(self, input_file: str) ‑> List[TrainingExample]
def load_dataset(self, input_file: str) -> List[TrainingExample]:
    """
    Load training examples from a file.
    
    Args:
        input_file: Path to input file
        
    Returns:
        List of TrainingExample objects
    """
    import json
    
    with open(input_file, 'r') as f:
        data = json.load(f)
    
    examples = []
    for item in data:
        example = TrainingExample(
            sequence_id=item['sequence_id'],
            description=item['description'],
            loda_code=item['loda_code'],
            terms=item.get('terms')
        )
        examples.append(example)
    
    print(f"Loaded {len(examples)} examples from {input_file}")
    return examples

Load training examples from a file.

Args

input_file
Path to input file

Returns

List of TrainingExample objects
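
Loading back a previously saved file (placeholder paths):

pre = DataPreprocessor("programs/oeis")   # placeholder path
examples = pre.load_dataset("training_data.json")
for ex in examples[:3]:
    print(ex.sequence_id, ex.description)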

def save_dataset(self,
examples: List[TrainingExample],
output_file: str)
def save_dataset(self, examples: List[TrainingExample], output_file: str):
    """
    Save training examples to a file for later use.
    
    Args:
        examples: List of training examples
        output_file: Path to output file
    """
    import json
    
    data = []
    for example in examples:
        data.append({
            'sequence_id': example.sequence_id,
            'description': example.description,
            'loda_code': example.loda_code,
            'terms': example.terms
        })
    
    with open(output_file, 'w') as f:
        json.dump(data, f, indent=2)
    
    print(f"Saved {len(examples)} examples to {output_file}")

Save training examples to a file for later use.

Args

examples
List of training examples
output_file
Path to output file
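
The output is plain JSON, so a saved dataset round-trips through load_dataset (placeholder path, `pre` an instance as above):

pre.save_dataset(examples, "training_data.json")
restored = pre.load_dataset("training_data.json")
assert len(restored) == len(examples)
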
class TrainingExample (sequence_id: str,
description: str,
loda_code: str,
terms: List[int] | None = None)
@dataclass
class TrainingExample:
    """A single training example pairing natural language with LODA code."""
    sequence_id: str
    description: str
    loda_code: str
    terms: Optional[List[int]] = None

A single training example pairing natural language with LODA code.

Instance variables

var description : str
var loda_code : str
var sequence_id : str
var terms : List[int] | None
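
Constructing an example directly; the program body here is a placeholder, not the real A000045 program:

from loda.llm.data_preprocessing import TrainingExample

ex = TrainingExample(
    sequence_id="A000045",
    description="Fibonacci numbers",
    loda_code="mov $1,$0",
    terms=[0, 1, 1, 2, 3, 5, 8, 13],
)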