Page 1 of 1

How to build your AI using Object Pascal (Part 1)

Posted: Fri Apr 03, 2026 1:01 am
by ONiX
The "Pascal Programmer" AI: Complete Lexer & Dataset Builder

This single program combines the Tokenizer, the Lexer, and a Recursive Directory Scanner to turn your entire codebase into a binary dataset for training.

Code: Select all

program PascalAIDatasetBuilder;

{$APPTYPE CONSOLE}

uses
  SysUtils, Classes;

type
  { --- Part 1: The Tokenizer (The AI's Vocabulary) --- }
  { Maps each distinct (trimmed, lower-cased) word or symbol to a unique
    integer ID. IDs start at 1; 0 is reserved for "unknown". }
  TTokenizer = class
  private
    FVocab: TStringList;  // sorted word list; Objects[] slot carries each word's ID
    FNextID: Integer;     // next ID to hand out (starts at 1)
  public
    constructor Create;
    destructor Destroy; override;
    procedure AddWord(const Word: string);             // register a word; no-op if already known
    function GetTokenID(const Word: string): Integer;  // returns 0 for unknown words
    procedure SaveVocab(const FileName: string);       // persist the vocabulary to a text file
  end;

  { --- Part 2: The Lexer (The AI's Vision) --- }
  { Streams source files through the tokenizer and appends each token's ID
    to a binary "file of Integer" dataset. }
  TLexer = class
  private
    FTokenizer: TTokenizer;    // shared tokenizer; not owned by the lexer
    FOutput: file of Integer;  // binary dataset opened for the lexer's lifetime
  public
    constructor Create(ATokenizer: TTokenizer; const OutFile: string);
    destructor Destroy; override;
    procedure ProcessFile(const FileName: string);     // lex one source file into the dataset
  end;

{ TTokenizer Implementation }
constructor TTokenizer.Create;
begin
  { Hand out IDs starting at 1; 0 stays reserved for "unknown". }
  FNextID := 1;
  { Keep the vocabulary sorted for fast IndexOf lookups; silently
    ignore duplicate insertions. }
  FVocab := TStringList.Create;
  FVocab.Duplicates := dupIgnore;
  FVocab.Sorted := True;
end;

destructor TTokenizer.Destroy;
begin
  { Release the owned vocabulary list before the base class tears down. }
  FreeAndNil(FVocab);
  inherited Destroy;
end;

procedure TTokenizer.AddWord(const Word: string);
var
  Normalized: string;
begin
  { Normalize so 'Begin', 'BEGIN ' and 'begin' all share a single ID. }
  Normalized := LowerCase(Trim(Word));
  if Normalized = '' then
    Exit;
  if FVocab.IndexOf(Normalized) <> -1 then
    Exit; // already registered under an earlier ID
  { Stash the numeric ID in the Objects[] slot alongside the word. }
  FVocab.AddObject(Normalized, TObject(PtrInt(FNextID)));
  Inc(FNextID);
end;

function TTokenizer.GetTokenID(const Word: string): Integer;
var
  Idx: Integer;
begin
  { Apply the SAME normalization as AddWord (trim + lower-case): the original
    only lower-cased, so a word registered with surrounding whitespace could
    never be found again and silently mapped to the unknown ID. }
  Idx := FVocab.IndexOf(LowerCase(Trim(Word)));
  if Idx <> -1 then
    Result := Integer(PtrInt(FVocab.Objects[Idx]))
  else
    Result := 0; // 0 is the reserved "unknown token" ID
end;

procedure TTokenizer.SaveVocab(const FileName: string);
var
  Ordered: TStringList;
  i, ID: Integer;
begin
  { FVocab is alphabetically sorted, but token IDs were assigned in insertion
    order. Saving FVocab directly (as before) loses the ID mapping: line N of
    the file would NOT correspond to ID N+1, breaking any consumer that uses
    IndexOf(word)+1 to recover an ID. Write the words in ID order instead,
    so line (ID-1) always holds the word with that ID. }
  Ordered := TStringList.Create;
  try
    // Pre-fill so every slot up to the highest assigned ID is addressable.
    for i := 1 to FNextID - 1 do
      Ordered.Add('');
    for i := 0 to FVocab.Count - 1 do
    begin
      ID := Integer(PtrInt(FVocab.Objects[i]));
      if (ID >= 1) and (ID < FNextID) then
        Ordered[ID - 1] := FVocab[i];
    end;
    Ordered.SaveToFile(FileName);
  finally
    Ordered.Free;
  end;
end;

{ TLexer Implementation }
constructor TLexer.Create(ATokenizer: TTokenizer; const OutFile: string);
begin
  { Keep a reference to the shared tokenizer; the lexer does not own it. }
  FTokenizer := ATokenizer;
  { Open the binary dataset as a typed file of Integer. Rewrite creates the
    file or truncates an existing one; it raises an I/O error if OutFile is
    not writable, in which case the file is left unopened. }
  Assign(FOutput, OutFile);
  Rewrite(FOutput);
end;

destructor TLexer.Destroy;
begin
  { Only close the dataset if Rewrite actually opened it. If the constructor
    fails at Rewrite (e.g. unwritable path), Destroy still runs on the
    half-constructed instance, and Close on a never-opened typed file raises
    I/O error 103. Typed files opened by Rewrite/Reset have Mode = fmInOut. }
  if TFileRec(FOutput).Mode = fmInOut then
    Close(FOutput);
  inherited;
end;

procedure TLexer.ProcessFile(const FileName: string);
const
  { Single-character punctuation treated as stand-alone tokens. }
  Symbols = [';', ':', '(', ')', ',', '.', '+', '-', '*', '/', '=', '<', '>'];
  { Anything at or below the space character separates tokens. }
  WhiteSpace = [#0..#32];
var
  Src: Text;
  CurLine, Token: string;
  P, StartPos: Integer;

  { Register a token in the vocabulary and append its ID to the dataset. }
  procedure EmitToken(const S: string);
  var
    ID: Integer;
  begin
    FTokenizer.AddWord(S);
    ID := FTokenizer.GetTokenID(S);
    Write(FOutput, ID);
  end;

begin
  Assign(Src, FileName);
  Reset(Src);
  try
    while not Eof(Src) do
    begin
      ReadLn(Src, CurLine);
      P := 1;
      while P <= Length(CurLine) do
      begin
        if CurLine[P] in WhiteSpace then
          Inc(P)
        else if CurLine[P] in Symbols then
        begin
          { Special case: fuse ':' immediately followed by '=' into ':='. }
          if (CurLine[P] = ':') and (P < Length(CurLine)) and (CurLine[P + 1] = '=') then
          begin
            EmitToken(':=');
            Inc(P, 2);
          end
          else
          begin
            Token := CurLine[P];
            EmitToken(Token);
            Inc(P);
          end;
        end
        else
        begin
          { Consume a run of non-symbol, non-whitespace characters as one word. }
          StartPos := P;
          while (P <= Length(CurLine)) and not (CurLine[P] in Symbols)
                and not (CurLine[P] in WhiteSpace) do
            Inc(P);
          Token := Copy(CurLine, StartPos, P - StartPos);
          if Token <> '' then
            EmitToken(Token);
        end;
      end;
    end;
  finally
    Close(Src);
  end;
end;

{ --- Part 3: Directory Scanner (The AI's Library) --- }
{ Recursively lexes every *.pas file under Path into the given lexer's dataset.
  Fixes vs. the original: uses the platform path delimiter instead of a
  hard-coded '\' (portable to FPC on Linux/macOS), and wraps each search in
  try/finally so FindClose is not leaked if ProcessFile raises mid-loop. }
procedure ScanDirectory(const Path: string; Lexer: TLexer);
var
  SR: TSearchRec;
  Base: string;
begin
  { Normalize so concatenation is safe however the caller terminated Path. }
  Base := IncludeTrailingPathDelimiter(Path);

  // Pass 1: process all .pas files directly inside this directory.
  if FindFirst(Base + '*.pas', faAnyFile, SR) = 0 then
  try
    repeat
      if (SR.Attr and faDirectory) = 0 then
      begin
        WriteLn('Processing: ', SR.Name);
        Lexer.ProcessFile(Base + SR.Name);
      end;
    until FindNext(SR) <> 0;
  finally
    FindClose(SR);
  end;

  // Pass 2: recurse into subdirectories (skipping '.' and '..').
  if FindFirst(Base + '*', faDirectory, SR) = 0 then
  try
    repeat
      if ((SR.Attr and faDirectory) <> 0) and (SR.Name <> '.') and (SR.Name <> '..') then
        ScanDirectory(Base + SR.Name, Lexer);
    until FindNext(SR) <> 0;
  finally
    FindClose(SR);
  end;
end;

{ Main Execution }
var
  MyTokenizer: TTokenizer;
  MyLexer: TLexer;
begin
  MyTokenizer := TTokenizer.Create;
  try
    // Seed with common keywords so their IDs stay stable across runs.
    MyTokenizer.AddWord('begin');
    MyTokenizer.AddWord('end');
    MyTokenizer.AddWord('procedure');
    MyTokenizer.AddWord('var');

    { Nested try/finally: in the original, MyTokenizer was created before any
      protected block, so it leaked if TLexer.Create raised (e.g. the dataset
      file could not be opened). }
    MyLexer := TLexer.Create(MyTokenizer, 'pascal_dataset.bin');
    try
      WriteLn('Starting Scan...');
      ScanDirectory('C:\MyPascalProjects\', MyLexer); // Change to your path

      MyTokenizer.SaveVocab('vocabulary.txt');
      WriteLn('Done! Dataset and Vocabulary saved.');
    finally
      MyLexer.Free; // closes the binary dataset file
    end;
  finally
    MyTokenizer.Free;
  end;
  ReadLn;
end.
Summary of what you've built:
Vocabulary Builder: A TStringList that assigns a unique "OpCode" (ID) to every keyword and variable.
Dataset Engine: A binary writer that converts code into a stream of integers (a 1D Tensor).
Recursive Crawler: A tool that recursively walks the directory tree you point it at, collecting Pascal source files to feed the machine.
You now have a raw binary file (pascal_dataset.bin) that can be loaded into a framework like CAI Neural API or TensorFlow to begin the actual training process.

Re: How to build your AI using Object Pascal (Part 2)

Posted: Fri Apr 03, 2026 1:08 am
by ONiX
The "Next Token" Prediction Test

In AI, the most basic form of intelligence is Probability. We want to see if the dataset "knows" that begin is usually followed by an indented block or a specific command.

Try building this "Probability Checker" in Delphi 7 to test your new files:

Code: Select all

{ Scans the binary dataset and prints the token that follows each occurrence
  of 'begin' — the simplest "bigram" probe of the dataset's coherence.
  Fixes vs. the original: TopFollowers was created but never used and never
  freed (memory leak); Vocab[NextID - 1] crashed with a list-index error when
  NextID was 0 (unknown token) or past the vocabulary's end; Vocab leaked if
  LoadFromFile or Reset raised before the protected block was entered. }
procedure TestIntelligence(const VocabFile, DataFile: string);
var
  Vocab: TStringList;
  Data: file of Integer;
  CurrentID, NextID: Integer;
  BeginID: Integer;
begin
  Vocab := TStringList.Create;
  try
    Vocab.LoadFromFile(VocabFile);
    { NOTE(review): assumes line N of the vocabulary file holds the word with
      ID N+1 — confirm the dataset builder saves words in ID order, not in
      alphabetical (sorted) order, or this lookup is meaningless. }
    BeginID := Vocab.IndexOf('begin') + 1;
    if BeginID = 0 then
    begin
      WriteLn('"begin" not found in vocabulary - nothing to test.');
      Exit;
    end;

    Assign(Data, DataFile);
    Reset(Data);
    try
      while not Eof(Data) do
      begin
        Read(Data, CurrentID);
        if (CurrentID = BeginID) and not Eof(Data) then
        begin
          Read(Data, NextID);
          // This is a "Bigram" - the simplest building block of a Transformer
          if (NextID >= 1) and (NextID <= Vocab.Count) then
            WriteLn('Found "begin" followed by: ', Vocab[NextID - 1])
          else
            WriteLn('Found "begin" followed by: <unknown token ', NextID, '>');
        end;
      end;
    finally
      Close(Data);
    end;
  finally
    Vocab.Free;
  end;
end;
What This Tells You
If the output shows a high frequency of WriteLn, if, or var, your dataset is Coherent. This is the "Ground Truth" the AI will study.

The Real "Next Step": Training the Weight Matrix

To move from "counting" to "learning," we need to build a Weight Matrix (a 2D Tensor).
Initialize a Matrix: Create a 2D Array of Single whose number of rows and number of columns both equal the size of your vocabulary.
The Learning Rule: Every time the Lexer sees Token A followed by Token B, we increment the value at Matrix[A, B] by a small amount (the "Learning Rate").
Softmax: We normalize the rows so they represent percentages (e.g., "After if, there is a 40% chance of ( and a 30% chance of not").

How many unique tokens did your vocabulary.txt end up with? (The size of that file determines the "Shape" of the neural network we build next.)