This single program combines the Tokenizer, the Lexer, and a Recursive Directory Scanner to turn your entire codebase into a binary dataset for training.
Summary of what you've built:
program PascalAIDatasetBuilder;

{$APPTYPE CONSOLE}

uses
  SysUtils, Classes;

type
  { --- Part 1: The Tokenizer (The AI's Vocabulary) --- }
  { Maps each distinct word/symbol to a stable integer token ID. }
  TTokenizer = class
  private
    FVocab: TStringList;   // sorted word list; Objects[] holds each word's ID
    FNextID: Integer;      // next ID to hand out (0 is reserved for "unknown")
  public
    constructor Create;
    destructor Destroy; override;
    procedure AddWord(const AWord: string);
    function GetTokenID(const AWord: string): Integer;
    procedure SaveVocab(const FileName: string);
  end;

  { --- Part 2: The Lexer (The AI's Vision) --- }
  { Splits source text into words/symbols and streams their IDs to a binary file. }
  TLexer = class
  private
    FTokenizer: TTokenizer;
    FOutput: file of Integer;  // typed binary file: one Integer per token
  public
    constructor Create(ATokenizer: TTokenizer; const OutFile: string);
    destructor Destroy; override;
    procedure ProcessFile(const FileName: string);
  end;

{ TTokenizer }

constructor TTokenizer.Create;
begin
  FVocab := TStringList.Create;
  FVocab.Sorted := True;          // sorted list => IndexOf uses binary search
  FVocab.Duplicates := dupIgnore;
  FNextID := 1;                   // ID 0 means "unknown token"
end;

destructor TTokenizer.Destroy;
begin
  FVocab.Free;
  inherited;
end;

{ Registers a word (case-insensitive) and assigns it the next free ID.
  Empty/whitespace-only input is ignored; a known word keeps its old ID. }
procedure TTokenizer.AddWord(const AWord: string);
var
  W: string;
begin
  W := LowerCase(Trim(AWord));
  if (W <> '') and (FVocab.IndexOf(W) = -1) then
  begin
    FVocab.AddObject(W, TObject(PtrInt(FNextID)));
    Inc(FNextID);
  end;
end;

{ Returns the ID of a word, or 0 when the word is unknown.
  Trims the input so lookups match what AddWord stored. }
function TTokenizer.GetTokenID(const AWord: string): Integer;
var
  Idx: Integer;
begin
  Idx := FVocab.IndexOf(LowerCase(Trim(AWord)));
  if Idx <> -1 then
    Result := Integer(PtrInt(FVocab.Objects[Idx]))
  else
    Result := 0; // unknown
end;

{ Saves the vocabulary as "word=ID" lines.
  BUG FIX: the original called FVocab.SaveToFile, which writes only the
  words and discards the IDs stored in Objects[], so the binary dataset
  could never be decoded back into text. }
procedure TTokenizer.SaveVocab(const FileName: string);
var
  Lines: TStringList;
  i: Integer;
begin
  Lines := TStringList.Create;
  try
    for i := 0 to FVocab.Count - 1 do
      Lines.Add(Format('%s=%d', [FVocab[i], Integer(PtrInt(FVocab.Objects[i]))]));
    Lines.SaveToFile(FileName);
  finally
    Lines.Free;
  end;
end;

{ TLexer }

constructor TLexer.Create(ATokenizer: TTokenizer; const OutFile: string);
begin
  FTokenizer := ATokenizer;
  Assign(FOutput, OutFile);
  Rewrite(FOutput);
end;

destructor TLexer.Destroy;
begin
  Close(FOutput);
  inherited;
end;

{ Tokenizes one source file: skips whitespace, emits one- and two-character
  symbols and identifier-like words, writing each token's ID to FOutput.
  NOTE(review): comments and string literals are NOT stripped - their
  contents are tokenized like ordinary code. }
procedure TLexer.ProcessFile(const FileName: string);
const
  Symbols = [';', ':', '(', ')', ',', '.', '+', '-', '*', '/', '=', '<', '>'];
  WhiteSpace = [#0..#32];
var
  F: Text;
  Line, Token: string;
  i, Start, ID: Integer;
begin
  Assign(F, FileName);
  Reset(F);
  try
    while not Eof(F) do
    begin
      ReadLn(F, Line);
      i := 1;
      while i <= Length(Line) do
      begin
        // Skip whitespace and control characters
        if Line[i] in WhiteSpace then
        begin
          Inc(i);
          Continue;
        end;

        // Symbols, including the two-character operators := <= >= <>.
        // BUG FIX: the original recognized only ':=' and split the other
        // compound operators into two separate tokens.
        if Line[i] in Symbols then
        begin
          Token := Line[i];
          if (i < Length(Line)) and
             (((Line[i] = ':') and (Line[i+1] = '=')) or
              ((Line[i] = '<') and (Line[i+1] in ['=', '>'])) or
              ((Line[i] = '>') and (Line[i+1] = '='))) then
          begin
            Token := Line[i] + Line[i+1];
            Inc(i);
          end;
          FTokenizer.AddWord(Token);
          ID := FTokenizer.GetTokenID(Token);
          Write(FOutput, ID);
          Inc(i);
          Continue;
        end;

        // Word: run of characters up to the next symbol or whitespace.
        // (Local renamed from 'Word', which shadowed the built-in type.)
        Start := i;
        while (i <= Length(Line)) and
              not (Line[i] in Symbols) and
              not (Line[i] in WhiteSpace) do
          Inc(i);
        Token := Copy(Line, Start, i - Start);
        if Token <> '' then
        begin
          FTokenizer.AddWord(Token);
          ID := FTokenizer.GetTokenID(Token);
          Write(FOutput, ID);
        end;
      end;
    end;
  finally
    Close(F);
  end;
end;

{ --- Part 3: Directory Scanner (The AI's Library) --- }
{ Recursively processes every *.pas file under Path.
  Path must end with a path delimiter. }
procedure ScanDirectory(const Path: string; Lexer: TLexer);
var
  SR: TSearchRec;
begin
  // Process all .pas files in the current directory
  if FindFirst(Path + '*.pas', faAnyFile, SR) = 0 then
  begin
    repeat
      if (SR.Attr and faDirectory) = 0 then
      begin
        WriteLn('Processing: ', SR.Name);
        Lexer.ProcessFile(Path + SR.Name);
      end;
    until FindNext(SR) <> 0;
    FindClose(SR);
  end;

  // Recurse into subdirectories.
  // PathDelim instead of a hard-coded '\': identical on Windows, portable elsewhere.
  if FindFirst(Path + '*', faDirectory, SR) = 0 then
  begin
    repeat
      if ((SR.Attr and faDirectory) <> 0) and
         (SR.Name <> '.') and (SR.Name <> '..') then
        ScanDirectory(Path + SR.Name + PathDelim, Lexer);
    until FindNext(SR) <> 0;
    FindClose(SR);
  end;
end;

{ Main Execution }
var
  MyTokenizer: TTokenizer;
  MyLexer: TLexer;
begin
  MyTokenizer := TTokenizer.Create;

  // Seed common keywords so their IDs stay consistent across runs
  MyTokenizer.AddWord('begin');
  MyTokenizer.AddWord('end');
  MyTokenizer.AddWord('procedure');
  MyTokenizer.AddWord('var');

  // Create the binary dataset file
  MyLexer := TLexer.Create(MyTokenizer, 'pascal_dataset.bin');
  try
    WriteLn('Starting Scan...');
    ScanDirectory('C:\MyPascalProjects\', MyLexer); // Change to your path
    MyTokenizer.SaveVocab('vocabulary.txt');
    // BUG FIX: this string literal was split across two physical lines in
    // the original paste, which is invalid Pascal.
    WriteLn('Done! Dataset and Vocabulary saved.');
  finally
    MyLexer.Free;
    MyTokenizer.Free;
  end;
  ReadLn;
end.
Vocabulary Builder: A TStringList that assigns a unique "OpCode" (ID) to every keyword and variable.
Dataset Engine: A binary writer that converts code into a stream of integers (a 1D Tensor).
Recursive Crawler: A tool that recursively walks a chosen directory tree, collecting every Pascal source file to feed the machine.
You now have a raw binary file (pascal_dataset.bin) that can be loaded into a framework like CAI Neural API or TensorFlow to begin the actual training process.