      # Hyperparameters for our GPT.
      # The vocabulary has only two possible tokens: 0 and 1.
      vocab_size = 2
      # The context holds 3 tokens, so 3 bits are used to predict the next bit's probability.
      context_length = 3

      # Every distinct context is one state, so there are vocab_size ** context_length states.
      print('state space (for this exercise) = ', vocab_size ** context_length)



          #@title minimal GPT implementation in PyTorch (optional)
          """ super minimal decoder-only gpt """
          import math
          from dataclasses import dataclass
          import torch
          import torch.nn as nn
          from torch.nn import functional as F
          class CausalSelfAttention(nn.Module):
              """Multi-head self-attention in which each position only attends to itself and earlier positions.

              The config object must provide ``n_embd``, ``n_head``, ``bias`` and
              ``block_size``; ``n_embd`` must be divisible by ``n_head``.
              """

              def __init__(self, config):
                  # nn.Module has to be initialised before submodules or buffers can be assigned.
                  super().__init__()
                  assert config.n_embd % config.n_head == 0
                  # key, query, value projections for all heads, computed as one batched matmul
                  self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
                  # output projection
                  self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
                  self.n_head = config.n_head
                  self.n_embd = config.n_embd
                  # Lower-triangular mask of ones (named "bias"), stored as a buffer so it is
                  # not a trainable parameter; zeros mark the future positions to hide.
                  self.register_buffer(
                      "bias",
                      torch.tril(torch.ones(config.block_size, config.block_size))
                      .view(1, 1, config.block_size, config.block_size),
                  )

              def forward(self, x):
                  """Apply causal self-attention to ``x`` of shape (B, T, n_embd); returns the same shape."""
                  B, T, C = x.size()  # batch size, sequence length, embedding dimensionality (n_embd)
                  # calculate query, key, values for all heads in batch and move head forward to be the batch dim
                  q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
                  k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
                  q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
                  v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)  # (B, nh, T, hs)
                  # manual implementation of attention: scaled dot product scores ...
                  att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
                  # ... with future positions set to -inf so softmax gives them zero weight
                  att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
                  att = F.softmax(att, dim=-1)
                  y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
                  y = y.transpose(1, 2).contiguous().view(B, T, C)  # re-assemble all head outputs side by side
                  # output projection
                  y = self.c_proj(y)
                  return y
          class MLP(nn.Module):
              """Position-wise feed-forward network: expand to 4 * n_embd, apply GELU, project back to n_embd.

              The config object must provide ``n_embd`` and ``bias``.
              """

              def __init__(self, config):
                  # nn.Module has to be initialised before submodules can be assigned.
                  super().__init__()
                  self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
                  self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
                  self.nonlin = nn.GELU()

              def forward(self, x):
                  """Run ``x`` (last dimension n_embd) through the MLP; the last dimension is preserved."""
                  x = self.c_fc(x)
                  x = self.nonlin(x)
                  x = self.c_proj(x)
                  return x
          class Block(nn.Module):
              """One transformer block: LayerNorm -> causal self-attention, then LayerNorm -> MLP.

              Each sub-layer's output is added back onto its input (residual connection).
              """

              def __init__(self, config):
                  # nn.Module has to be initialised before submodules can be assigned.
                  super().__init__()
                  self.ln_1 = nn.LayerNorm(config.n_embd)
                  self.attn = CausalSelfAttention(config)
                  self.ln_2 = nn.LayerNorm(config.n_embd)
                  self.mlp = MLP(config)

              def forward(self, x):
                  """Apply the attention and MLP sub-layers to ``x``, each with a residual connection."""
                  x = x + self.attn(self.ln_1(x))
                  x = x + self.mlp(self.ln_2(x))
                  return x
          @dataclass
          class GPTConfig:
              """Hyperparameters for the GPT model.

              Declared as a dataclass so it can be built with keyword overrides,
              e.g. ``GPTConfig(block_size=3, vocab_size=2)``.
              """

              # these are default GPT-2 hyperparameters
              block_size: int = 1024  # maximum sequence length (context size)
              vocab_size: int = 50304  # number of distinct tokens
              n_layer: int = 12  # number of transformer blocks
              n_head: int = 12  # attention heads per block; must divide n_embd
              n_embd: int = 768  # embedding dimensionality
              bias: bool = False  # whether the Linear layers use a bias term
          class GPT(nn.Module):
              """Minimal decoder-only GPT that predicts the next token from a context of token ids.

              ``forward`` returns logits for the last position only, so the model maps a
              context of up to ``config.block_size`` tokens to one next-token distribution.
              """

              def __init__(self, config):
                  # nn.Module has to be initialised before submodules can be assigned.
                  super().__init__()
                  assert config.vocab_size is not None
                  assert config.block_size is not None
                  self.config = config

                  self.transformer = nn.ModuleDict(dict(
                      wte = nn.Embedding(config.vocab_size, config.n_embd),  # token embeddings
                      wpe = nn.Embedding(config.block_size, config.n_embd),  # position embeddings
                      h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
                      ln_f = nn.LayerNorm(config.n_embd),
                  ))
                  self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
                  # weight tying: the token embedding and the output head share one matrix
                  self.transformer.wte.weight = self.lm_head.weight

                  # init all weights
                  self.apply(self._init_weights)
                  # apply special scaled init to the residual projections, per GPT-2 paper
                  for pn, p in self.named_parameters():
                      if pn.endswith('c_proj.weight'):
                          torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))

                  # report number of parameters
                  print("number of parameters: %d" % (sum(p.nelement() for p in self.parameters()),))

              def _init_weights(self, module):
                  """Initialise Linear/Embedding weights from N(0, 0.02) and zero any Linear bias."""
                  if isinstance(module, nn.Linear):
                      torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
                      if module.bias is not None:
                          torch.nn.init.zeros_(module.bias)
                  elif isinstance(module, nn.Embedding):
                      torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

              def forward(self, idx):
                  """Return next-token logits of shape (b, vocab_size) for token ids ``idx`` of shape (b, t).

                  Raises AssertionError if ``t`` exceeds ``config.block_size``.
                  """
                  device = idx.device
                  b, t = idx.size()
                  assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
                  pos = torch.arange(0, t, dtype=torch.long, device=device).unsqueeze(0)  # shape (1, t)

                  # forward the GPT model itself
                  tok_emb = self.transformer.wte(idx)  # token embeddings of shape (b, t, n_embd)
                  pos_emb = self.transformer.wpe(pos)  # position embeddings of shape (1, t, n_embd)
                  x = tok_emb + pos_emb
                  for block in self.transformer.h:
                      x = block(x)
                  x = self.transformer.ln_f(x)
                  logits = self.lm_head(x[:, -1, :])  # note: only returning logits at the last time step (-1), output is 2D (b, vocab_size)
                  return logits


            # Tiny model for the binary exercise: context and vocabulary come from the
            # hyperparameters defined at the top of the notebook.
            config = GPTConfig(
                block_size = context_length,
                vocab_size = vocab_size,
                n_layer = 4,
                n_head = 4,
                n_embd = 16,
                bias = False,
            )
            gpt = GPT(config)
            number of parameters: 12656




              def all_possible(n, k):
                  """Yield every list of k elements with each element in range(n), in lexicographic order.

                  For k == 0 a single empty list is yielded; for n == 0 and k > 0 nothing is yielded.
                  """
                  if k == 0:
                      yield []
                  else:
                      # Pick the first element, then recurse for the remaining k - 1 positions.
                      for i in range(n):
                          for c in all_possible(n, k - 1):
                              yield [i] + c
              # Enumerate every state (context) of the toy model.
              list(all_possible(vocab_size, context_length))
              [[0, 0, 0],
               [0, 0, 1],
               [0, 1, 0],
               [0, 1, 1],
               [1, 0, 0],
               [1, 0, 1],
               [1, 1, 0],
               [1, 1, 1]]


                # we'll use graphviz for pretty plotting the current state of the GPT
                from graphviz import Digraph
                def plot_model():
                    """Print the next-token probabilities for every context and return the transition graph.

                    Uses the module-level ``gpt`` model. Each context is a node; each edge goes to
                    the context obtained by appending one token and is labelled with that token
                    and its probability in percent.
                    """
                    dot = Digraph(comment='Baby GPT', engine='circo')
                    for xi in all_possible(gpt.config.vocab_size, gpt.config.block_size):
                        # forward the GPT and get probabilities for next token
                        x = torch.tensor(xi, dtype=torch.long)[None, ...]  # turn the list into a torch tensor and add a batch dimension
                        logits = gpt(x)  # forward the gpt neural net
                        probs = nn.functional.softmax(logits, dim=-1)  # get the probabilities
                        y = probs[0].tolist()  # remove the batch dimension and unpack the tensor into simple list
                        print(f"input {xi} ---> {y}")
                        # also build up the transition graph for plotting later
                        current_node_signature = "".join(str(d) for d in xi)
                        for t in range(gpt.config.vocab_size):
                            next_node = xi[1:] + [t]  # crop the context and append the next character
                            next_node_signature = "".join(str(d) for d in next_node)
                            p = y[t]
                            # edge label: the emitted token and its probability, e.g. "1(55%)"
                            label = f"{t}({p*100:.0f}%)"
                            dot.edge(current_node_signature, next_node_signature, label=label)
                    return dot
                input [0, 0, 0] ---> [0.4963349997997284, 0.5036649107933044]
                input [0, 0, 1] ---> [0.4515703618526459, 0.5484296679496765]
                input [0, 1, 0] ---> [0.49648362398147583, 0.5035163760185242]
                input [0, 1, 1] ---> [0.45181113481521606, 0.5481888651847839]
                input [1, 0, 0] ---> [0.4961162209510803, 0.5038837194442749]
                input [1, 0, 1] ---> [0.4517717957496643, 0.5482282042503357]
                input [1, 1, 0] ---> [0.4962802827358246, 0.5037197470664978]
                input [1, 1, 1] ---> [0.4520467519760132, 0.5479532480239868]



                  # Training sequence for the baby GPT: one integer per character of the bit string.
                  seq = [int(bit) for bit in "111101111011110"]
                  [1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0]

                    # convert the sequence to a tensor holding all the individual examples in that sequence
                    X, Y = [], []
                    # iterate over the sequence and grab every consecutive window of context_length bits
                    # the correct label for what's next is the bit that follows the window
                    for i in range(len(seq) - context_length):
                        X.append(seq[i:i+context_length])
                        Y.append(seq[i+context_length])
                        print(f"example {i+1:2d}: {X[-1]} --> {Y[-1]}")
                    X = torch.tensor(X, dtype=torch.long)
                    Y = torch.tensor(Y, dtype=torch.long)
                    print(X.shape, Y.shape)
                    example  1: [1, 1, 1] --> 1
                    example  2: [1, 1, 1] --> 0
                    example  3: [1, 1, 0] --> 1
                    example  4: [1, 0, 1] --> 1
                    example  5: [0, 1, 1] --> 1
                    example  6: [1, 1, 1] --> 1
                    example  7: [1, 1, 1] --> 0
                    example  8: [1, 1, 0] --> 1
                    example  9: [1, 0, 1] --> 1
                    example 10: [0, 1, 1] --> 1
                    example 11: [1, 1, 1] --> 1
                    example 12: [1, 1, 1] --> 0
                    torch.Size([12, 3]) torch.Size([12])


                    # Build a new GPT from the config and an AdamW optimizer over its parameters.
                    gpt = GPT(config)
                    optimizer = torch.optim.AdamW(
                        gpt.parameters(),
                        lr=1e-3,
                        weight_decay=1e-1,
                    )
                    number of parameters: 12656
                    # train the GPT for some number of iterations
                    for i in range(50):
                        logits = gpt(X)
                        loss = F.cross_entropy(logits, Y)
                        # Without these three steps the weights never change and the loss stays flat.
                        loss.backward()
                        optimizer.step()
                        optimizer.zero_grad()
                        print(i, loss.item())
                    0 0.663539469242096
                    1 0.6393510103225708
                    2 0.6280076503753662
                    3 0.6231870055198669
                    4 0.6198631525039673
                    5 0.6163331270217896
                    6 0.6124278903007507
                    7 0.6083487868309021
                    8 0.6043017506599426
                    9 0.6004215478897095
                    10 0.5967749953269958
                    11 0.5933789610862732
                    12 0.5902208685874939
                    13 0.5872761011123657
                    14 0.5845204591751099
                    15 0.5819371342658997
                    16 0.5795179009437561
                    17 0.5772626996040344
                    18 0.5751749873161316
                    19 0.5732589960098267
                    20 0.5715171694755554
                    21 0.5699482560157776
                    22 0.5685476660728455
                    23 0.5673080086708069
                    24 0.5662192106246948
                    25 0.5652689337730408
                    26 0.5644428730010986
                    27 0.563723087310791
                    28 0.5630872845649719
                    29 0.5625078678131104
                    30 0.5619534254074097
                    31 0.5613844990730286
                    32 0.5607481598854065
                    33 0.5599767565727234
                    34 0.5589826107025146
                    35 0.5576505064964294
                    36 0.5558211803436279
                    37 0.5532580018043518
                    38 0.5495675802230835
                    39 0.5440602898597717
                    40 0.5359978079795837
                    41 0.5282725095748901
                    42 0.5195847153663635
                    43 0.5095029473304749
                    44 0.5019271969795227
                    45 0.49031805992126465
                    46 0.48338067531585693
                    47 0.4769590198993683
                    48 0.47185763716697693
                    49 0.4699831008911133
                    # Print the training sequence again, labelled as a reminder.
                    print("Training data sequence, as a reminder:", seq)
                    Training data sequence, as a reminder: [1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0]
                    input [0, 0, 0] ---> [0.2683657109737396, 0.7316343188285828]
                    input [0, 0, 1] ---> [0.21858924627304077, 0.7814106941223145]
                    input [0, 1, 0] ---> [0.24217553436756134, 0.7578244805335999]
                    input [0, 1, 1] ---> [0.20438867807388306, 0.7956112623214722]
                    input [1, 0, 0] ---> [0.252511203289032, 0.747488796710968]
                    input [1, 0, 1] ---> [0.20714525878429413, 0.7928547859191895]
                    input [1, 1, 0] ---> [0.2200900763273239, 0.7799099087715149]
                    input [1, 1, 1] ---> [0.5463876128196716, 0.45361238718032837]








                    # Sample from the model by walking its Markov chain for 20 steps.
                    xi = [1, 1, 1]  # the starting sequence
                    fullseq = xi.copy()
                    print(f"init: {xi}")
                    for k in range(20):
                        x = torch.tensor(xi, dtype=torch.long)[None, ...]
                        logits = gpt(x)
                        probs = nn.functional.softmax(logits, dim=-1)
                        t = torch.multinomial(probs[0], num_samples=1).item()  # sample from the probability distribution
                        xi = xi[1:] + [t]  # transition to the next state
                        fullseq.append(t)  # record the sampled token so the full sequence can be printed
                        print(f"step {k}: state {xi}")
                    print("\nfull sampled sequence:")
                    print("".join(map(str, fullseq)))
                    init: [1, 1, 1]
                    step 0: state [1, 1, 0]
                    step 1: state [1, 0, 1]
                    step 2: state [0, 1, 1]
                    step 3: state [1, 1, 1]
                    step 4: state [1, 1, 0]
                    step 5: state [1, 0, 1]
                    step 6: state [0, 1, 1]
                    step 7: state [1, 1, 1]
                    step 8: state [1, 1, 0]
                    step 9: state [1, 0, 1]
                    step 10: state [0, 1, 1]
                    step 11: state [1, 1, 0]
                    step 12: state [1, 0, 1]
                    step 13: state [0, 1, 1]
                    step 14: state [1, 1, 1]
                    step 15: state [1, 1, 1]
                    step 16: state [1, 1, 0]
                    step 17: state [1, 0, 1]
                    step 18: state [0, 1, 0]
                    step 19: state [1, 0, 1]
                    full sampled sequence:



                    # Variation: a context of 2 tokens over a vocabulary of 3 tokens.
                    config = GPTConfig(
                        block_size = 2,
                        vocab_size = 3,
                        n_layer = 4,
                        n_head = 4,
                        n_embd = 16,
                        bias = False,
                    )
                    gpt = GPT(config)
                    number of parameters: 12656
                    input [0, 0] ---> [0.4023578464984894, 0.3247871398925781, 0.2728550136089325]
                    input [0, 1] ---> [0.3112931251525879, 0.41417476534843445, 0.27453210949897766]
                    input [0, 2] ---> [0.29536890983581543, 0.30436983704566956, 0.400261253118515]
                    input [1, 0] ---> [0.4040412902832031, 0.32429811358451843, 0.2716606557369232]
                    input [1, 1] ---> [0.3113819658756256, 0.4152715802192688, 0.2733464539051056]
                    input [1, 2] ---> [0.29491397738456726, 0.302636981010437, 0.40244901180267334]
                    input [2, 0] ---> [0.40355363488197327, 0.3235832452774048, 0.27286314964294434]
                    input [2, 1] ---> [0.31285664439201355, 0.41349685192108154, 0.2736465036869049]
                    input [2, 2] ---> [0.29775166511535645, 0.30284032225608826, 0.3994080722332001]


                    实际规模:上面是一个上下文长度为 3 个令牌的二进制 GPT。在实际应用中,词汇量不是 2 个,而是约 5 万个;上下文也不是 3 个令牌,典型的上下文长度约为 2048,甚至可以达到约 32000。



                    GPT-2 有 50,257 个令牌,上下文长度为 2048 个令牌。因此每个状态需要 log2(50,257) × 2048 ≈ 31,984 位 ≈ 3,998 字节(约 4 kB)。这足够去月球了。




                    人工智能安全:从“GPT 是有限状态马尔可夫链”的角度来看,什么是安全?安全就是消除转移到不良状态的所有可能性。例如,以令牌序列 [66,6371,532,82,3740,1378,23542,6371,13,785,14,79,675,276,13,1477,930,27334] 结尾的状态。这个令牌序列编码的是 curl -s | bash。在更大的系统环境中,这些令牌最终可能会在终端中被执行,这将带来问题。更一般地,你可以想象状态空间的某些部分是“红色的”,表示我们永远不希望转移到的不良状态。这类状态的集合非常大,很难显式枚举,因此简单地一次性“屏蔽它们”的做法并不令人满意。GPT 模型本身必须基于训练数据和 Transformer 的归纳偏置,知道转移到这些状态的概率实际上应当为 0%。如果这个概率不够小(例如 < 1e-100?),那么在足够大规模的部署中(可能温度 > 0,并且可能没有使用 top-p / top-k 采样超参数把低概率转移强制置零),就可能偶然遇到它们。

