Module loda.llm.data_preprocessing

Data preprocessing utilities for LLM training on OEIS sequences and LODA programs.

This module handles:

1. Extracting sequence descriptions from LODA program comments
2. Pairing natural language descriptions with LODA code
3. Creating training datasets for sequence-to-sequence models
4. Tokenization and data formatting for transformer models

Functions

def create_dataset(programs_dir: str, output_file: str, max_examples: int = -1, augment: bool = True)
def create_dataset(programs_dir: str, output_file: str, max_examples: int = -1, augment: bool = True):
    """
    Convenience function to create and save a training dataset.
    
    Args:
        programs_dir: Path to OEIS programs directory
        output_file: Path to save the dataset
        max_examples: Maximum number of examples (-1 for all)
        augment: Whether to augment with description variations
    """
    preprocessor = DataPreprocessor(programs_dir)
    examples = preprocessor.create_training_examples(max_examples)
    
    if augment:
        examples = preprocessor.augment_descriptions(examples)
    
    preprocessor.save_dataset(examples, output_file)
    return examples

Convenience function to create and save a training dataset.

Args

programs_dir
Path to OEIS programs directory
output_file
Path to save the dataset
max_examples
Maximum number of examples (-1 for all)
augment
Whether to augment with description variations
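
A minimal usage sketch; the paths below are placeholders for a local LODA/OEIS programs checkout and an output location:

from loda.llm.data_preprocessing import create_dataset

# Placeholder paths; point these at your local programs directory
# and desired output file.
examples = create_dataset(
    programs_dir="programs/oeis",
    output_file="training_data.json",
    max_examples=1000,   # cap on programs scanned; -1 processes all
    augment=True,        # also emit rephrased description variations
)
print(f"Got {len(examples)} examples (including augmented variations)")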

Classes

class DataPreprocessor (programs_dir: str)
class DataPreprocessor:
    """Handles preprocessing of OEIS programs for LLM training."""
    
    def __init__(self, programs_dir: str):
        """Initialize with path to OEIS programs directory."""
        self.programs_dir = programs_dir
        self.program_cache = ProgramCache(programs_dir)
        
    def extract_description_from_program(self, program_text: str) -> Optional[str]:
        """
        Extract the natural language description from a LODA program.
        
        LODA programs typically start with comments like:
        ; A000045: Fibonacci numbers: F(n) = F(n-1) + F(n-2) with F(0) = 0 and F(1) = 1.
        
        Args:
            program_text: The full LODA program as text
            
        Returns:
            The description string or None if no description found
        """
        lines = program_text.strip().split('\n')
        
        for line in lines:
            # Look for OEIS description lines (start with ; A######:)
            match = re.match(r';\s*A\d{6}:\s*(.+)', line)
            if match:
                description = match.group(1).strip()
                # Clean up common artifacts: strip any trailing periods
                description = description.rstrip('.')
                return description
                
        return None
    
    def extract_terms_from_program(self, program_text: str) -> Optional[List[int]]:
        """
        Extract the sequence terms from a LODA program comment.
        
        Args:
            program_text: The full LODA program as text
            
        Returns:
            List of sequence terms or None if not found
        """
        lines = program_text.strip().split('\n')
        
        for line in lines:
            # Look for lines with comma-separated numbers (sequence terms)
            if line.startswith(';') and ',' in line:
                # Extract numbers from the line
                numbers_str = line[1:].strip()  # Remove the ';'
                # Skip if it looks like it contains non-numeric content
                if ':' in numbers_str or any(c.isalpha() for c in numbers_str):
                    continue
                    
                try:
                    terms = [int(x.strip()) for x in numbers_str.split(',') if x.strip()]
                    if len(terms) >= 5:  # Reasonable number of terms
                        return terms
                except ValueError:
                    continue
                    
        return None
    
    def clean_loda_code(self, program_text: str) -> str:
        """
        Clean LODA code by removing comments and normalizing format.
        
        Args:
            program_text: Raw LODA program text
            
        Returns:
            Cleaned LODA code suitable for training
        """
        lines = program_text.strip().split('\n')
        code_lines = []
        
        for line in lines:
            # Skip comment lines (lines that start with ;)
            if line.strip().startswith(';'):
                continue
            # Skip empty lines
            if not line.strip():
                continue
            
            # Remove inline comments (everything after ; on the same line)
            if ';' in line:
                code_part = line.split(';')[0].strip()
            else:
                code_part = line.strip()
            
            # Only add non-empty code lines
            if code_part:
                code_lines.append(code_part)
        
        return '\n'.join(code_lines)
    
    def create_training_examples(self, max_examples: int = -1) -> List[TrainingExample]:
        """
        Create training examples from all available LODA programs.
        
        Args:
            max_examples: Maximum number of examples to create (-1 for all)
            
        Returns:
            List of TrainingExample objects
        """
        examples = []
        program_ids = self.program_cache.all_ids()
        
        if max_examples > 0:
            program_ids = program_ids[:max_examples]
        
        print(f"Processing {len(program_ids)} programs...")
        
        for i, program_id in enumerate(program_ids):
            if i % 1000 == 0:
                print(f"Processed {i}/{len(program_ids)} programs")
                
            try:
                # Read the program file
                program_path = self.program_cache.path(program_id)
                if not os.path.exists(program_path):
                    continue
                    
                with open(program_path, 'r') as f:
                    program_text = f.read()
                
                # Extract description
                description = self.extract_description_from_program(program_text)
                if not description:
                    continue
                
                # Extract terms (optional)
                terms = self.extract_terms_from_program(program_text)
                
                # Clean the LODA code
                clean_code = self.clean_loda_code(program_text)
                if not clean_code:
                    continue
                
                # Validate that the code parses correctly
                try:
                    Program(clean_code)
                except Exception:
                    continue  # Skip programs that don't parse
                
                example = TrainingExample(
                    sequence_id=str(program_id),
                    description=description,
                    loda_code=clean_code,
                    terms=terms
                )
                examples.append(example)
                
            except Exception as e:
                print(f"Error processing {program_id}: {e}")
                continue
        
        print(f"Created {len(examples)} training examples")
        return examples
    
    def augment_descriptions(self, examples: List[TrainingExample]) -> List[TrainingExample]:
        """
        Augment training examples with variations of descriptions.
        
        This can help make the model more robust to different phrasings.
        
        Args:
            examples: List of original training examples
            
        Returns:
            Augmented list with additional variations
        """
        augmented = list(examples)  # Start with originals
        
        for example in examples:
            desc = example.description
            
            # Create variations
            variations = []
            
            # Add "sequence of" prefix if not present
            if not desc.lower().startswith(('sequence', 'the sequence')):
                variations.append(f"Sequence of {desc.lower()}")
            
            # Add "Generate" prefix
            variations.append(f"Generate {desc.lower()}")
            
            # Add "Compute" prefix
            variations.append(f"Compute {desc.lower()}")
            
            # Remove mathematical symbols for simpler versions
            simple_desc = re.sub(r'[()=+\-*/^]', ' ', desc)
            simple_desc = re.sub(r'\s+', ' ', simple_desc).strip()
            if simple_desc != desc and simple_desc:
                variations.append(simple_desc)
            
            # Create new examples for each variation
            for variation in variations:
                augmented_example = TrainingExample(
                    sequence_id=str(example.sequence_id) + "_aug",
                    description=variation,
                    loda_code=example.loda_code,
                    terms=example.terms
                )
                augmented.append(augmented_example)
        
        return augmented
    
    def save_dataset(self, examples: List[TrainingExample], output_file: str):
        """
        Save training examples to a file for later use.
        
        Args:
            examples: List of training examples
            output_file: Path to output file
        """
        import json
        
        data = []
        for example in examples:
            data.append({
                'sequence_id': example.sequence_id,
                'description': example.description,
                'loda_code': example.loda_code,
                'terms': example.terms
            })
        
        with open(output_file, 'w') as f:
            json.dump(data, f, indent=2)
        
        print(f"Saved {len(examples)} examples to {output_file}")
    
    def load_dataset(self, input_file: str) -> List[TrainingExample]:
        """
        Load training examples from a file.
        
        Args:
            input_file: Path to input file
            
        Returns:
            List of TrainingExample objects
        """
        import json
        
        with open(input_file, 'r') as f:
            data = json.load(f)
        
        examples = []
        for item in data:
            example = TrainingExample(
                sequence_id=item['sequence_id'],
                description=item['description'],
                loda_code=item['loda_code'],
                terms=item.get('terms')
            )
            examples.append(example)
        
        print(f"Loaded {len(examples)} examples from {input_file}")
        return examples

Handles preprocessing of OEIS programs for LLM training.

Initialize with path to OEIS programs directory.
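
The methods below compose into a pipeline. A typical run, assuming a local programs checkout at the placeholder path "programs/oeis", might look like:

from loda.llm.data_preprocessing import DataPreprocessor

pre = DataPreprocessor("programs/oeis")   # placeholder path
examples = pre.create_training_examples(max_examples=100)
examples = pre.augment_descriptions(examples)
pre.save_dataset(examples, "training_data.json")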

Methods

def augment_descriptions(self,
examples: List[TrainingExample]) ‑> List[TrainingExample]
def augment_descriptions(self, examples: List[TrainingExample]) -> List[TrainingExample]:
    """
    Augment training examples with variations of descriptions.
    
    This can help make the model more robust to different phrasings.
    
    Args:
        examples: List of original training examples
        
    Returns:
        Augmented list with additional variations
    """
    augmented = list(examples)  # Start with originals
    
    for example in examples:
        desc = example.description
        
        # Create variations
        variations = []
        
        # Add "sequence of" prefix if not present
        if not desc.lower().startswith(('sequence', 'the sequence')):
            variations.append(f"Sequence of {desc.lower()}")
        
        # Add "Generate" prefix
        variations.append(f"Generate {desc.lower()}")
        
        # Add "Compute" prefix
        variations.append(f"Compute {desc.lower()}")
        
        # Remove mathematical symbols for simpler versions
        simple_desc = re.sub(r'[()=+\-*/^]', ' ', desc)
        simple_desc = re.sub(r'\s+', ' ', simple_desc).strip()
        if simple_desc != desc and simple_desc:
            variations.append(simple_desc)
        
        # Create new examples for each variation
        for variation in variations:
            augmented_example = TrainingExample(
                sequence_id=str(example.sequence_id) + "_aug",
                description=variation,
                loda_code=example.loda_code,
                terms=example.terms
            )
            augmented.append(augmented_example)
    
    return augmented

Augment training examples with variations of descriptions.

This can help make the model more robust to different phrasings.

Args

examples
List of original training examples

Returns

Augmented list with additional variations
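
For example, a single example whose description is "Fibonacci numbers" yields the original plus three prefixed variations; the symbol-stripped variant is skipped here because the description contains no mathematical symbols. This sketch assumes `preprocessor` is a DataPreprocessor instance constructed as shown earlier:

from loda.llm.data_preprocessing import TrainingExample

ex = TrainingExample(sequence_id="A000045",
                     description="Fibonacci numbers",
                     loda_code="mov $1,$0")   # placeholder program body
out = preprocessor.augment_descriptions([ex])
# len(out) == 4; the three added examples have descriptions:
#   'Sequence of fibonacci numbers'
#   'Generate fibonacci numbers'
#   'Compute fibonacci numbers'
# and all share the sequence_id 'A000045_aug'.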

def clean_loda_code(self, program_text: str) ‑> str
def clean_loda_code(self, program_text: str) -> str:
    """
    Clean LODA code by removing comments and normalizing format.
    
    Args:
        program_text: Raw LODA program text
        
    Returns:
        Cleaned LODA code suitable for training
    """
    lines = program_text.strip().split('\n')
    code_lines = []
    
    for line in lines:
        # Skip comment lines (lines that start with ;)
        if line.strip().startswith(';'):
            continue
        # Skip empty lines
        if not line.strip():
            continue
        
        # Remove inline comments (everything after ; on the same line)
        if ';' in line:
            code_part = line.split(';')[0].strip()
        else:
            code_part = line.strip()
        
        # Only add non-empty code lines
        if code_part:
            code_lines.append(code_part)
    
    return '\n'.join(code_lines)

Clean LODA code by removing comments and normalizing format.

Args

program_text
Raw LODA program text

Returns

Cleaned LODA code suitable for training
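
For instance, with a placeholder program body and `preprocessor` an instance as above (note that inline comments and leading indentation are both stripped):

raw = """; A000045: Fibonacci numbers.
mov $3,1 ; seed value
lpb $0
  sub $0,1
lpe"""
print(preprocessor.clean_loda_code(raw))
# mov $3,1
# lpb $0
# sub $0,1
# lpe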

def create_training_examples(self, max_examples: int = -1) ‑> List[TrainingExample]
def create_training_examples(self, max_examples: int = -1) -> List[TrainingExample]:
    """
    Create training examples from all available LODA programs.
    
    Args:
        max_examples: Maximum number of examples to create (-1 for all)
        
    Returns:
        List of TrainingExample objects
    """
    examples = []
    program_ids = self.program_cache.all_ids()
    
    if max_examples > 0:
        program_ids = program_ids[:max_examples]
    
    print(f"Processing {len(program_ids)} programs...")
    
    for i, program_id in enumerate(program_ids):
        if i % 1000 == 0:
            print(f"Processed {i}/{len(program_ids)} programs")
            
        try:
            # Read the program file
            program_path = self.program_cache.path(program_id)
            if not os.path.exists(program_path):
                continue
                
            with open(program_path, 'r') as f:
                program_text = f.read()
            
            # Extract description
            description = self.extract_description_from_program(program_text)
            if not description:
                continue
            
            # Extract terms (optional)
            terms = self.extract_terms_from_program(program_text)
            
            # Clean the LODA code
            clean_code = self.clean_loda_code(program_text)
            if not clean_code:
                continue
            
            # Validate that the code parses correctly
            try:
                Program(clean_code)
            except Exception:
                continue  # Skip programs that don't parse
            
            example = TrainingExample(
                sequence_id=str(program_id),
                description=description,
                loda_code=clean_code,
                terms=terms
            )
            examples.append(example)
            
        except Exception as e:
            print(f"Error processing {program_id}: {e}")
            continue
    
    print(f"Created {len(examples)} training examples")
    return examples

Create training examples from all available LODA programs.

Args

max_examples
Maximum number of examples to create (-1 for all)

Returns

List of TrainingExample objects
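
Note that max_examples truncates the list of program ids before any filtering, so the result may contain fewer examples than requested. A usage sketch with a placeholder path:

pre = DataPreprocessor("programs/oeis")   # placeholder path
examples = pre.create_training_examples(max_examples=500)
# len(examples) <= 500: programs without a description, with empty
# code, or that fail to parse are skipped.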

def extract_description_from_program(self, program_text: str) ‑> str | None
def extract_description_from_program(self, program_text: str) -> Optional[str]:
    """
    Extract the natural language description from a LODA program.
    
    LODA programs typically start with comments like:
    ; A000045: Fibonacci numbers: F(n) = F(n-1) + F(n-2) with F(0) = 0 and F(1) = 1.
    
    Args:
        program_text: The full LODA program as text
        
    Returns:
        The description string or None if no description found
    """
    lines = program_text.strip().split('\n')
    
    for line in lines:
        # Look for OEIS description lines (start with ; A######:)
        match = re.match(r';\s*A\d{6}:\s*(.+)', line)
        if match:
            description = match.group(1).strip()
            # Clean up common artifacts: strip any trailing periods
            description = description.rstrip('.')
            return description
            
    return None

Extract the natural language description from a LODA program.

LODA programs typically start with comments like:

; A000045: Fibonacci numbers: F(n) = F(n-1) + F(n-2) with F(0) = 0 and F(1) = 1.

Args

program_text
The full LODA program as text

Returns

The description string or None if no description found
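
Given the comment format above, the method returns the text after the ID with trailing periods stripped. This sketch assumes `preprocessor` is a DataPreprocessor instance constructed as shown earlier:

text = ("; A000045: Fibonacci numbers: F(n) = F(n-1) + F(n-2) "
        "with F(0) = 0 and F(1) = 1.\nmov $1,$0")
preprocessor.extract_description_from_program(text)
# -> 'Fibonacci numbers: F(n) = F(n-1) + F(n-2) with F(0) = 0 and F(1) = 1'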

def extract_terms_from_program(self, program_text: str) ‑> List[int] | None
def extract_terms_from_program(self, program_text: str) -> Optional[List[int]]:
    """
    Extract the sequence terms from a LODA program comment.
    
    Args:
        program_text: The full LODA program as text
        
    Returns:
        List of sequence terms or None if not found
    """
    lines = program_text.strip().split('\n')
    
    for line in lines:
        # Look for lines with comma-separated numbers (sequence terms)
        if line.startswith(';') and ',' in line:
            # Extract numbers from the line
            numbers_str = line[1:].strip()  # Remove the ';'
            # Skip if it looks like it contains non-numeric content
            if ':' in numbers_str or any(c.isalpha() for c in numbers_str):
                continue
                
            try:
                terms = [int(x.strip()) for x in numbers_str.split(',') if x.strip()]
                if len(terms) >= 5:  # Reasonable number of terms
                    return terms
            except ValueError:
                continue
                
    return None

Extract the sequence terms from a LODA program comment.

Args

program_text
The full LODA program as text

Returns

List of sequence terms or None if not found
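
A comment line consisting only of comma-separated integers is parsed; lines containing letters or colons are skipped. Again assuming `preprocessor` is an instance as above:

text = "; A000045: Fibonacci numbers.\n; 0,1,1,2,3,5,8,13,21,34\nmov $1,$0"
preprocessor.extract_terms_from_program(text)
# -> [0, 1, 1, 2, 3, 5, 8, 13, 21, 34]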

def load_dataset(self, input_file: str) ‑> List[TrainingExample]
def load_dataset(self, input_file: str) -> List[TrainingExample]:
    """
    Load training examples from a file.
    
    Args:
        input_file: Path to input file
        
    Returns:
        List of TrainingExample objects
    """
    import json
    
    with open(input_file, 'r') as f:
        data = json.load(f)
    
    examples = []
    for item in data:
        example = TrainingExample(
            sequence_id=item['sequence_id'],
            description=item['description'],
            loda_code=item['loda_code'],
            terms=item.get('terms')
        )
        examples.append(example)
    
    print(f"Loaded {len(examples)} examples from {input_file}")
    return examples

Load training examples from a file.

Args

input_file
Path to input file

Returns

List of TrainingExample objects
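
Loading back a previously saved file (placeholder paths):

pre = DataPreprocessor("programs/oeis")   # placeholder path
examples = pre.load_dataset("training_data.json")
for ex in examples[:3]:
    print(ex.sequence_id, ex.description)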

def save_dataset(self,
examples: List[TrainingExample],
output_file: str)
def save_dataset(self, examples: List[TrainingExample], output_file: str):
    """
    Save training examples to a file for later use.
    
    Args:
        examples: List of training examples
        output_file: Path to output file
    """
    import json
    
    data = []
    for example in examples:
        data.append({
            'sequence_id': example.sequence_id,
            'description': example.description,
            'loda_code': example.loda_code,
            'terms': example.terms
        })
    
    with open(output_file, 'w') as f:
        json.dump(data, f, indent=2)
    
    print(f"Saved {len(examples)} examples to {output_file}")

Save training examples to a file for later use.

Args

examples
List of training examples
output_file
Path to output file
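
The output is plain JSON, so a saved dataset round-trips through load_dataset (placeholder path, `pre` an instance as above):

pre.save_dataset(examples, "training_data.json")
restored = pre.load_dataset("training_data.json")
assert len(restored) == len(examples)
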
class TrainingExample (sequence_id: str,
description: str,
loda_code: str,
terms: List[int] | None = None)
@dataclass
class TrainingExample:
    """A single training example pairing natural language with LODA code."""
    sequence_id: str
    description: str
    loda_code: str
    terms: Optional[List[int]] = None

A single training example pairing natural language with LODA code.

Instance variables

var description : str
var loda_code : str
var sequence_id : str
var terms : List[int] | None
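
Constructing an example directly; the program body here is a placeholder, not the real A000045 program:

from loda.llm.data_preprocessing import TrainingExample

ex = TrainingExample(
    sequence_id="A000045",
    description="Fibonacci numbers",
    loda_code="mov $1,$0",
    terms=[0, 1, 1, 2, 3, 5, 8, 13],
)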