import java.util.*;
import java.util.stream.Collectors;
/**
* MiniGPT.java — a didactic, single-file miniature GPT-like Transformer in pure Java.
* - Tokenizer: whitespace tokenizer with on-the-fly vocab from a tiny corpus
* - Embeddings: token + sinusoidal positional encodings
* - Architecture: [LayerNorm → MultiHeadSelfAttention → residual] + [LayerNorm → FFN → residual]
* - Causal mask for autoregressive prediction
* - Greedy decoding loop (argmax) for demonstration
*
* NOTE: Random weights => gibberish text. This is for structure & learning only.
*/
public class MiniGPT {
/* ===========================
Hyperparameters (toy)
=========================== */
static final int D_MODEL = 64; // embedding size
static final int NUM_HEADS = 4; // attention heads
static final int D_HEAD = D_MODEL / NUM_HEADS;
static final int D_FF = 4 * D_MODEL; // FFN hidden size
static final int MAX_LEN = 64; // max context length
static final int NUM_LAYERS = 2; // transformer blocks
static final Random RNG = new Random(42);
/* ===========================
Tokenizer (whitespace/bag-of-words)
=========================== */
static class Tokenizer {
Map<String, Integer> toId = new HashMap<>();
Map<Integer, String> toTok = new HashMap<>();
int padId, eosId, unkId;
Tokenizer(String corpus) {
// Basic vocab from corpus + special tokens
Set<String> vocab = new LinkedHashSet<>();
Collections.addAll(vocab, "<PAD>", "<EOS>", "<UNK>");
for (String w : corpus.split("\\s+")) {
if (!w.isEmpty()) vocab.add(normalize(w));
}
int idx = 0;
for (String t : vocab) {
toId.put(t, idx);
toTok.put(idx, t);
idx++;
}
padId = toId.get("<PAD>");
eosId = toId.get("<EOS>");
unkId = toId.get("<UNK>");
}
String normalize(String s) {
return s.toLowerCase();
}
int[] encode(String text, int maxLen) {
            String[] toks = text.trim().isEmpty() ? new String[0] : text.trim().split("\\s+");
int[] out = new int[Math.min(maxLen, toks.length + 1)];
int i = 0;
for (; i < out.length - 1 && i < toks.length; i++) {
String n = normalize(toks[i]);
out[i] = toId.getOrDefault(n, unkId);
}
// add <EOS> if room
if (i < out.length) out[i] = toId.get("<EOS>");
return out;
}
String decode(List<Integer> ids) {
return ids.stream().map(i -> toTok.getOrDefault(i, "<UNK>"))
.collect(Collectors.joining(" "));
}
int vocabSize() { return toId.size(); }
}
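    /* Tokenizer usage sketch (hypothetical strings, separate from the demo in main()):
       Tokenizer t = new Tokenizer("hello world");
       int[] ids = t.encode("hello world", 8);          // ids for [hello, world, <EOS>]
       String back = t.decode(List.of(ids[0], ids[1])); // "hello world"
    */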
/* ===========================
Utilities (linear algebra)
=========================== */
static float[][] zeros(int rows, int cols) {
float[][] m = new float[rows][cols];
return m;
}
static float[] zeros(int n) { return new float[n]; }
static float[][] randn(int rows, int cols, float scale) {
float[][] m = new float[rows][cols];
for (int i = 0; i < rows; i++)
for (int j = 0; j < cols; j++)
m[i][j] = (float) (RNG.nextGaussian() * scale);
return m;
}
static float[] randn(int n, float scale) {
float[] v = new float[n];
for (int i = 0; i < n; i++) v[i] = (float) (RNG.nextGaussian() * scale);
return v;
}
static float[] add(float[] a, float[] b) {
float[] out = new float[a.length];
for (int i = 0; i < a.length; i++) out[i] = a[i] + b[i];
return out;
}
static void addInPlace(float[] a, float[] b) {
for (int i = 0; i < a.length; i++) a[i] += b[i];
}
static float[] matvec(float[][] W, float[] x) {
int r = W.length, c = W[0].length;
float[] y = new float[r];
for (int i = 0; i < r; i++) {
float sum = 0f;
for (int j = 0; j < c; j++) sum += W[i][j] * x[j];
y[i] = sum;
}
return y;
}
static float[][] softmaxRows(float[][] logits) {
int T = logits.length;
int C = logits[0].length;
float[][] out = new float[T][C];
for (int t = 0; t < T; t++) {
            float max = Float.NEGATIVE_INFINITY;
for (int c = 0; c < C; c++) max = Math.max(max, logits[t][c]);
float sum = 0f;
for (int c = 0; c < C; c++) {
out[t][c] = (float) Math.exp(logits[t][c] - max);
sum += out[t][c];
}
for (int c = 0; c < C; c++) out[t][c] /= sum;
}
return out;
}
static int argmax(float[] v) {
int idx = 0; float best = v[0];
for (int i = 1; i < v.length; i++) if (v[i] > best) { best = v[i]; idx = i; }
return idx;
}
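    // Optional decoding helper: temperature sampling as an alternative to the
    // greedy argmax above. This is an illustrative sketch, not wired into the
    // demo by default; temp = 1.0 samples the plain softmax, smaller values
    // sharpen the distribution toward argmax.
    static int sampleWithTemperature(float[] logits, float temp) {
        float max = Float.NEGATIVE_INFINITY;
        for (float v : logits) max = Math.max(max, v);  // subtract max for numerical stability
        double[] p = new double[logits.length];
        double sum = 0.0;
        for (int i = 0; i < logits.length; i++) {
            p[i] = Math.exp((logits[i] - max) / temp);  // unnormalized softmax(logits / temp)
            sum += p[i];
        }
        double r = RNG.nextDouble() * sum;              // inverse-CDF draw
        double acc = 0.0;
        for (int i = 0; i < p.length; i++) {
            acc += p[i];
            if (r <= acc) return i;
        }
        return p.length - 1;                            // guard against rounding
    }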
/* ===========================
Embeddings + Positional Encoding
=========================== */
static class Embeddings {
final float[][] tokenTable; // [Vocab, D_MODEL]
Embeddings(int vocab, int d) {
// Standard init ~ N(0, 1/sqrt(d))
tokenTable = randn(vocab, d, (float)(1.0 / Math.sqrt(d)));
}
float[] lookup(int tokenId) {
return Arrays.copyOf(tokenTable[tokenId], tokenTable[tokenId].length);
}
}
static class PositionalEncoding {
final float[][] table; // [MAX_LEN, D_MODEL]
PositionalEncoding(int maxLen, int dModel) {
table = new float[maxLen][dModel];
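            // Sinusoidal encoding (Vaswani et al., 2017):
            //   PE(pos, 2i)   = sin(pos / 10000^(2i / dModel))
            //   PE(pos, 2i+1) = cos(pos / 10000^(2i / dModel))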
for (int pos = 0; pos < maxLen; pos++) {
for (int i = 0; i < dModel; i++) {
                    // Dimensions 2i and 2i+1 share one frequency; (i / 2) is the pair index.
                    double angle = pos / Math.pow(10000.0, (2.0 * (i / 2)) / dModel);
if (i % 2 == 0) table[pos][i] = (float) Math.sin(angle);
else table[pos][i] = (float) Math.cos(angle);
}
}
}
void addInPlace(float[] x, int pos) {
for (int i = 0; i < x.length; i++) x[i] += table[pos][i];
}
}
/* ===========================
LayerNorm
=========================== */
static class LayerNorm {
final int d;
final float[] gamma, beta;
final float eps = 1e-5f;
LayerNorm(int d) {
this.d = d;
this.gamma = new float[d];
this.beta = new float[d];
Arrays.fill(gamma, 1f);
Arrays.fill(beta, 0f);
}
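        // Normalizes each feature of x, then applies a learned affine transform:
        //   y_i = gamma_i * (x_i - mean) / sqrt(var + eps) + beta_i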
float[] forward(float[] x) {
float mean = 0f, var = 0f;
for (float v : x) mean += v;
mean /= d;
for (float v : x) { float dv = v - mean; var += dv * dv; }
var /= d;
float inv = (float) (1.0 / Math.sqrt(var + eps));
float[] y = new float[d];
for (int i = 0; i < d; i++) {
y[i] = ((x[i] - mean) * inv) * gamma[i] + beta[i];
}
return y;
}
}
/* ===========================
Multi-Head Self-Attention (causal)
=========================== */
static class MultiHeadSelfAttention {
final int dModel, numHeads, dHead;
final float[][] Wq, Wk, Wv, Wo;
final float[] bo, bq, bk, bv;
MultiHeadSelfAttention(int dModel, int numHeads) {
this.dModel = dModel;
this.numHeads = numHeads;
this.dHead = dModel / numHeads;
float scale = (float)(1.0 / Math.sqrt(dModel));
Wq = randn(dModel, dModel, scale);
Wk = randn(dModel, dModel, scale);
Wv = randn(dModel, dModel, scale);
Wo = randn(dModel, dModel, scale);
bo = randn(dModel, scale);
bq = randn(dModel, scale);
bk = randn(dModel, scale);
bv = randn(dModel, scale);
}
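        /* Shape walkthrough for one forward() pass over T tokens (toy sizes above):
           X [T,64] -> Q,K,V [T,64] -> split into NUM_HEADS=4 heads of [T,16]
           per-head scores/attn [T,T], entries with j > i masked to -1e9 pre-softmax
           per-head context [T,16] -> concat [T,64] -> Wo projection back to [T,64] */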
// X: [T, D_MODEL]
float[][] forward(float[][] X) {
int T = X.length;
// Project to Q, K, V
float[][] Q = new float[T][dModel];
float[][] K = new float[T][dModel];
float[][] V = new float[T][dModel];
for (int t = 0; t < T; t++) {
Q[t] = add(matvec(Wq, X[t]), bq);
K[t] = add(matvec(Wk, X[t]), bk);
V[t] = add(matvec(Wv, X[t]), bv);
}
// Split heads: [T, H, dHead]
float[][][] Qh = splitHeads(Q);
float[][][] Kh = splitHeads(K);
float[][][] Vh = splitHeads(V);
// Scaled dot-product attention per head with causal mask
float[][][] outHeads = new float[numHeads][T][dHead];
for (int h = 0; h < numHeads; h++) {
// scores: [T, T]
float[][] scores = new float[T][T];
for (int i = 0; i < T; i++) {
for (int j = 0; j < T; j++) {
float dot = 0f;
for (int k = 0; k < dHead; k++) dot += Qh[h][i][k] * Kh[h][j][k];
scores[i][j] = (float) (dot / Math.sqrt(dHead));
// causal mask: disallow attending to future
if (j > i) scores[i][j] = -1e9f;
}
}
float[][] attn = softmaxRows(scores); // [T, T]
// context: [T, dHead] = attn @ Vh
float[][] ctx = new float[T][dHead];
for (int i = 0; i < T; i++) {
for (int j = 0; j < T; j++) {
float a = attn[i][j];
for (int k = 0; k < dHead; k++) ctx[i][k] += a * Vh[h][j][k];
}
}
outHeads[h] = ctx;
}
// Concatenate heads -> [T, D_MODEL]
float[][] concat = new float[T][dModel];
for (int t = 0; t < T; t++) {
int off = 0;
for (int h = 0; h < numHeads; h++) {
for (int k = 0; k < dHead; k++) concat[t][off++] = outHeads[h][t][k];
}
}
// Output projection
float[][] Y = new float[T][dModel];
for (int t = 0; t < T; t++) {
Y[t] = add(matvec(Wo, concat[t]), bo);
}
return Y;
}
float[][][] splitHeads(float[][] X) {
int T = X.length;
float[][][] out = new float[numHeads][T][dHead];
for (int t = 0; t < T; t++) {
for (int h = 0; h < numHeads; h++) {
System.arraycopy(X[t], h * dHead, out[h][t], 0, dHead);
}
}
return out;
}
}
/* ===========================
Position-wise FeedForward
=========================== */
static class FeedForward {
final float[][] W1, W2;
final float[] b1, b2;
FeedForward(int dModel, int dFF) {
float s1 = (float)(1.0 / Math.sqrt(dModel));
float s2 = (float)(1.0 / Math.sqrt(dFF));
W1 = randn(dFF, dModel, s1);
W2 = randn(dModel, dFF, s2);
b1 = randn(dFF, s1);
b2 = randn(dModel, s2);
}
float gelu(float x) {
// Approximate GELU
return (float)(0.5 * x * (1.0 + Math.tanh(Math.sqrt(2 / Math.PI) * (x + 0.044715 * Math.pow(x, 3)))));
}
float[] forward(float[] x) {
// y = W2 * GELU(W1 * x + b1) + b2
float[] h = add(matvec(W1, x), b1);
for (int i = 0; i < h.length; i++) h[i] = gelu(h[i]);
float[] y = add(matvec(W2, h), b2);
return y;
}
float[][] forwardBatch(float[][] X) {
int T = X.length;
float[][] Y = new float[T][X[0].length];
for (int t = 0; t < T; t++) Y[t] = forward(X[t]);
return Y;
}
}
/* ===========================
Transformer Block
=========================== */
static class TransformerBlock {
final LayerNorm ln1, ln2;
final MultiHeadSelfAttention attn;
final FeedForward ffn;
TransformerBlock(int dModel, int numHeads, int dFF) {
ln1 = new LayerNorm(dModel);
ln2 = new LayerNorm(dModel);
attn = new MultiHeadSelfAttention(dModel, numHeads);
ffn = new FeedForward(dModel, dFF);
}
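        // Pre-norm residual form of the block below:
        //   a = x + Attn(LN1(x));  y = a + FFN(LN2(a))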
float[][] forward(float[][] X) {
// Pre-norm
int T = X.length;
float[][] X1 = new float[T][X[0].length];
for (int t = 0; t < T; t++) X1[t] = ln1.forward(X[t]);
// Self-attention + residual
float[][] A = attn.forward(X1);
float[][] R1 = new float[T][X[0].length];
for (int t = 0; t < T; t++) {
R1[t] = Arrays.copyOf(X[t], X[t].length);
                addInPlace(R1[t], A[t]); // residual add via the shared static helper
}
// Pre-norm again
float[][] R1n = new float[T][X[0].length];
for (int t = 0; t < T; t++) R1n[t] = ln2.forward(R1[t]);
// FFN + residual
float[][] F = ffn.forwardBatch(R1n);
float[][] R2 = new float[T][X[0].length];
for (int t = 0; t < T; t++) {
R2[t] = Arrays.copyOf(R1[t], R1[t].length);
                addInPlace(R2[t], F[t]); // residual add via the shared static helper
}
return R2;
}
}
/* ===========================
Language head (to logits)
=========================== */
static class LMHead {
final float[][] W; // [Vocab, D_MODEL]
final float[] b;
LMHead(int vocab, int dModel) {
float scale = (float)(1.0 / Math.sqrt(dModel));
W = randn(vocab, dModel, scale);
b = randn(vocab, scale);
}
// Takes last hidden state h_T -> logits over vocab
float[] logits(float[] h) {
float[] out = new float[W.length];
for (int i = 0; i < W.length; i++) {
float sum = b[i];
for (int j = 0; j < h.length; j++) sum += W[i][j] * h[j];
out[i] = sum;
}
return out;
}
}
/* ===========================
MiniGPT Model
=========================== */
static class MiniGPTModel {
final Tokenizer tok;
final Embeddings emb;
final PositionalEncoding pos;
final TransformerBlock[] blocks;
final LayerNorm lnFinal;
final LMHead head;
MiniGPTModel(Tokenizer tok) {
this.tok = tok;
this.emb = new Embeddings(tok.vocabSize(), D_MODEL);
this.pos = new PositionalEncoding(MAX_LEN, D_MODEL);
this.blocks = new TransformerBlock[NUM_LAYERS];
for (int i = 0; i < NUM_LAYERS; i++) {
blocks[i] = new TransformerBlock(D_MODEL, NUM_HEADS, D_FF);
}
this.lnFinal = new LayerNorm(D_MODEL);
this.head = new LMHead(tok.vocabSize(), D_MODEL);
}
// Encode token ids to hidden states [T, D_MODEL]
float[][] encode(int[] ids) {
int T = Math.min(ids.length, MAX_LEN);
float[][] X = new float[T][D_MODEL];
for (int t = 0; t < T; t++) {
X[t] = emb.lookup(ids[t]);
pos.addInPlace(X[t], t);
}
return X;
}
// Forward through blocks, return logits for next token (from last position)
float[] forwardNextLogits(int[] ids) {
float[][] X = encode(ids);
for (TransformerBlock block : blocks) X = block.forward(X);
// final layer norm
int last = X.length - 1;
float[] hLast = lnFinal.forward(X[last]);
return head.logits(hLast);
}
// Greedy generation (argmax); maxNewTokens is small to avoid long runs
List<Integer> generate(List<Integer> context, int maxNewTokens) {
ArrayList<Integer> out = new ArrayList<>(context);
for (int step = 0; step < maxNewTokens; step++) {
                int start = Math.max(0, out.size() - MAX_LEN);
                List<Integer> window = out.subList(start, out.size());
                int[] ids = new int[window.size()];
                for (int i = 0; i < window.size(); i++) ids[i] = window.get(i);
                // The hidden state at the last window position already predicts the
                // next token, so the window is fed as-is; no dummy position is needed.
float[] logits = forwardNextLogits(ids);
int next = argmax(logits);
out.add(next);
if (next == tok.eosId) break;
}
return out;
}
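        // A sketch of stochastic decoding built on sampleWithTemperature() above
        // (a helper added for illustration; the original demo uses greedy generate()).
        // Usage: model.generateSampled(ctx, 12, 0.8f) mirrors generate() but draws
        // each token from the tempered softmax instead of taking the argmax.
        List<Integer> generateSampled(List<Integer> context, int maxNewTokens, float temp) {
            ArrayList<Integer> out = new ArrayList<>(context);
            for (int step = 0; step < maxNewTokens; step++) {
                int start = Math.max(0, out.size() - MAX_LEN);
                int[] ids = new int[out.size() - start];
                for (int i = 0; i < ids.length; i++) ids[i] = out.get(start + i);
                int next = sampleWithTemperature(forwardNextLogits(ids), temp);
                out.add(next);
                if (next == tok.eosId) break;
            }
            return out;
        }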
}
/* ===========================
Demo main()
=========================== */
public static void main(String[] args) {
// Tiny "corpus" to build a vocab for the toy tokenizer
String corpus = """
<PAD> <EOS> <UNK> hello world this is a tiny miniature gpt transformer
it has attention heads tokens and positions
deep learning still runs on code in java example demo prompt response
""";
Tokenizer tokenizer = new Tokenizer(corpus);
MiniGPTModel model = new MiniGPTModel(tokenizer);
String prompt = "hello world this is";
int[] enc = tokenizer.encode(prompt, Math.min(MAX_LEN - 1, 16));
        List<Integer> ctx = new ArrayList<>();
        // Keep the prompt open-ended: drop the trailing <EOS> that encode() appends.
        for (int id : enc) if (id != tokenizer.eosId) ctx.add(id);
List<Integer> gen = model.generate(ctx, 12); // generate up to 12 new tokens
// Show only the newly generated tokens (after the original context)
List<Integer> newTokens = gen.subList(ctx.size(), gen.size());
System.out.println("Prompt: " + prompt);
System.out.println("Generated token IDs: " + newTokens);
System.out.println("Generated text: " + tokenizer.decode(newTokens));
System.out.println("\n(Weights are random, so output will be nonsense—structure is the learning goal.)");
}
}