Răsfoiți Sursa

fix chapter3 positionEncode

jjyaoao 7 luni în urmă
părinte
comite
cf968e1a7c

+ 6 - 6
docs/chapter3/Chapter3-Fundamentals-of-Large-Language-Models.md

@@ -453,19 +453,19 @@ class PositionalEncoding(nn.Module):
         div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
 
         # pe (positional encoding) size is (max_len, d_model)
-        pe = torch.zeros(1, max_len, d_model)
+        pe = torch.zeros(max_len, d_model)
 
         # Even dimensions use sin, odd dimensions use cos
-        pe[:, 0, 0::2] = torch.sin(position * div_term)
-        pe[:, 0, 1::2] = torch.cos(position * div_term)
+        pe[:, 0::2] = torch.sin(position * div_term)
+        pe[:, 1::2] = torch.cos(position * div_term)
 
         # Register pe as buffer, so it won't be treated as model parameter but will move with the model (e.g., to(device))
-        self.register_buffer('pe', pe)
+        self.register_buffer('pe', pe.unsqueeze(0))
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        # x.size(0) is the current input sequence length
+        # x.size(1) is the current input sequence length
         # Add positional encoding to input vector
-        x = x + self.pe[:x.size(0)]
+        x = x + self.pe[:, :x.size(1)]
         return self.dropout(x)
 ```
 

+ 6 - 6
docs/chapter3/第三章 大语言模型基础.md

@@ -455,19 +455,19 @@ class PositionalEncoding(nn.Module):
         div_term = torch.exp(torch.arange(0, d_model, 2) * (-math.log(10000.0) / d_model))
 
         # pe (positional encoding) 的大小为 (max_len, d_model)
-        pe = torch.zeros(1, max_len, d_model)
+        pe = torch.zeros(max_len, d_model)
 
         # 偶数维度使用 sin, 奇数维度使用 cos
-        pe[:, 0, 0::2] = torch.sin(position * div_term)
-        pe[:, 0, 1::2] = torch.cos(position * div_term)
+        pe[:, 0::2] = torch.sin(position * div_term)
+        pe[:, 1::2] = torch.cos(position * div_term)
 
         # 将 pe 注册为 buffer,这样它就不会被视为模型参数,但会随模型移动(例如 to(device))
-        self.register_buffer('pe', pe)
+        self.register_buffer('pe', pe.unsqueeze(0))
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
-        # x.size(0) 是当前输入的序列长度
+        # x.size(1) 是当前输入的序列长度
         # 将位置编码加到输入向量上
-        x = x + self.pe[:x.size(0)]
+        x = x + self.pe[:, :x.size(1)]
         return self.dropout(x)
 ```