# Minimal Transformer (encoder-decoder) built with TensorFlow / Keras.
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Embedding, Dropout, LayerNormalization
from tensorflow.keras.models import Model
import numpy as np


def positional_encoding(position, d_model):
    # Sinusoidal positional encoding, shape (1, position, d_model).
    angle_rads = np.arange(position)[:, np.newaxis] / np.power(
        10000, (2 * (np.arange(d_model) // 2)) / np.float32(d_model)
    )
    angle_rads[:, 0::2] = np.sin(angle_rads[:, 0::2])  # even indices: sine
    angle_rads[:, 1::2] = np.cos(angle_rads[:, 1::2])  # odd indices: cosine
    pos_encoding = angle_rads[np.newaxis, ...]
    return tf.cast(pos_encoding, dtype=tf.float32)


class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.num_heads = num_heads
        self.d_model = d_model
        assert d_model % num_heads == 0
        self.depth = d_model // num_heads  # per-head dimension
        self.wq = Dense(d_model)
        self.wk = Dense(d_model)
        self.wv = Dense(d_model)
        self.dense = Dense(d_model)

    def split_heads(self, x, batch_size):
        # (batch, seq_len, d_model) -> (batch, num_heads, seq_len, depth)
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def scaled_dot_product_attention(self, q, k, v, mask):
        matmul_qk = tf.matmul(q, k, transpose_b=True)
        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        scaled_logits = matmul_qk / tf.math.sqrt(dk)
        if mask is not None:
            # Masked positions (mask == 1) receive a large negative logit.
            scaled_logits += (mask * -1e9)
        attention_weights = tf.nn.softmax(scaled_logits, axis=-1)
        output = tf.matmul(attention_weights, v)
        return output, attention_weights

    def call(self, v, k, q, mask=None):
        batch_size = tf.shape(q)[0]
        q = self.wq(q)
        k = self.wk(k)
        v = self.wv(v)
        q = self.split_heads(q, batch_size)
        k = self.split_heads(k, batch_size)
        v = self.split_heads(v, batch_size)
        scaled_attention, attention_weights = self.scaled_dot_product_attention(q, k, v, mask)
        # Merge the heads back into a single (batch, seq_len, d_model) tensor.
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model))
        output = self.dense(concat_attention)
        return output


class PositionwiseFeedforward(tf.keras.layers.Layer):
    def __init__(self, d_model, dff):
        super().__init__()
        self.dense1 = Dense(dff, activation='relu')
        self.dense2 = Dense(d_model)

    def call(self, x):
        x = self.dense1(x)
        return self.dense2(x)


class TransformerBlock(tf.keras.layers.Layer):
    # Self-attention + feed-forward sub-layers, each followed by dropout,
    # a residual connection, and layer normalization.
    def __init__(self, d_model, num_heads, dff, dropout_rate=0.1):
        super().__init__()
        self.att = MultiHeadAttention(d_model, num_heads)
        self.ffn = PositionwiseFeedforward(d_model, dff)
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(dropout_rate)
        self.dropout2 = Dropout(dropout_rate)

    def call(self, x, training=False, mask=None):
        attn_output = self.att(x, x, x, mask=mask)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(x + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm2(out1 + ffn_output)
        return out2


class Encoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
                 maximum_position_encoding, dropout_rate=0.1):
        super().__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.embedding = Embedding(input_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)
        self.dropout = Dropout(dropout_rate)
        self.enc_layers = [TransformerBlock(d_model, num_heads, dff, dropout_rate)
                           for _ in range(num_layers)]

    def call(self, x, training=False, mask=None):
        seq_len = tf.shape(x)[1]
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))  # scale embeddings
        x += self.pos_encoding[:, :seq_len, :]
        x = self.dropout(x, training=training)
        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training=training, mask=mask)
        return x
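# ---------------------------------------------------------------------------
# Optional sketch (not used by the script below): the Decoder defined next
# reuses TransformerBlock, so it only applies masked self-attention and never
# attends to the encoder output. A standard encoder-decoder Transformer adds a
# second, cross-attention sub-layer per decoder block. The DecoderLayer below
# is a minimal illustration of that idea, built from the MultiHeadAttention and
# PositionwiseFeedforward layers defined above; the class name and structure
# are assumptions, not part of the original code.
# ---------------------------------------------------------------------------
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, dropout_rate=0.1):
        super().__init__()
        self.self_att = MultiHeadAttention(d_model, num_heads)   # masked self-attention
        self.cross_att = MultiHeadAttention(d_model, num_heads)  # attention over encoder output
        self.ffn = PositionwiseFeedforward(d_model, dff)
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.layernorm3 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(dropout_rate)
        self.dropout2 = Dropout(dropout_rate)
        self.dropout3 = Dropout(dropout_rate)

    def call(self, x, enc_output, training=False, look_ahead_mask=None, padding_mask=None):
        # 1) Masked self-attention over the target sequence.
        attn1 = self.self_att(x, x, x, mask=look_ahead_mask)
        attn1 = self.dropout1(attn1, training=training)
        out1 = self.layernorm1(x + attn1)
        # 2) Cross-attention: queries from the decoder, keys/values from the encoder.
        attn2 = self.cross_att(enc_output, enc_output, out1, mask=padding_mask)
        attn2 = self.dropout2(attn2, training=training)
        out2 = self.layernorm2(out1 + attn2)
        # 3) Position-wise feed-forward network.
        ffn_output = self.ffn(out2)
        ffn_output = self.dropout3(ffn_output, training=training)
        return self.layernorm3(out2 + ffn_output)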
class Decoder(tf.keras.layers.Layer):
    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size,
                 maximum_position_encoding, dropout_rate=0.1):
        super().__init__()
        self.d_model = d_model
        self.num_layers = num_layers
        self.embedding = Embedding(target_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)
        self.dropout = Dropout(dropout_rate)
        # NOTE: this decoder reuses TransformerBlock, i.e. masked self-attention
        # only. It never attends to enc_output, and padding_mask is unused; the
        # DecoderLayer sketch above shows how cross-attention could be added.
        self.dec_layers = [TransformerBlock(d_model, num_heads, dff, dropout_rate)
                           for _ in range(num_layers)]

    def call(self, x, enc_output, training=False, look_ahead_mask=None, padding_mask=None):
        seq_len = tf.shape(x)[1]
        attention_weights = {}  # placeholder, kept so the return signature is (output, weights)
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[:, :seq_len, :]
        x = self.dropout(x, training=training)
        for i in range(self.num_layers):
            x = self.dec_layers[i](x, training=training, mask=look_ahead_mask)
        return x, attention_weights


class Transformer(Model):
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
                 target_vocab_size, maximum_position_encoding, dropout_rate=0.1):
        super().__init__()
        self.encoder = Encoder(num_layers, d_model, num_heads, dff, input_vocab_size,
                               maximum_position_encoding, dropout_rate)
        self.decoder = Decoder(num_layers, d_model, num_heads, dff, target_vocab_size,
                               maximum_position_encoding, dropout_rate)
        self.final_layer = Dense(target_vocab_size)  # projects to target-vocabulary logits

    def call(self, inputs, training=False, look_ahead_mask=None, padding_mask=None):
        inp, tar = inputs
        enc_output = self.encoder(inp, training=training, mask=padding_mask)
        dec_output, _ = self.decoder(tar, enc_output, training=training,
                                     look_ahead_mask=look_ahead_mask,
                                     padding_mask=padding_mask)
        final_output = self.final_layer(dec_output)
        return final_output


# Example hyperparameters
num_layers = 2
d_model = 128
num_heads = 8
dff = 512
input_vocab_size = 8500
target_vocab_size = 8000
maximum_position_encoding = 10000
dropout_rate = 0.1

transformer = Transformer(
    num_layers, d_model, num_heads, dff, input_vocab_size,
    target_vocab_size, maximum_position_encoding, dropout_rate
)

# Random token ids standing in for tokenized source / target batches.
inputs = tf.random.uniform((64, 50), dtype=tf.int64, minval=0, maxval=input_vocab_size)
targets = tf.random.uniform((64, 50), dtype=tf.int64, minval=0, maxval=target_vocab_size)

look_ahead_mask = None
padding_mask = None

output = transformer((inputs, targets), training=True,
                     look_ahead_mask=look_ahead_mask, padding_mask=padding_mask)
print(output.shape)  # (64, 50, target_vocab_size)
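# ---------------------------------------------------------------------------
# The demo above passes look_ahead_mask=None and padding_mask=None, so nothing
# is actually masked. The helpers below are a minimal sketch of how the masks
# are usually built for this convention (1.0 marks a position to block,
# matching the `mask * -1e9` term in scaled_dot_product_attention). The helper
# names and the pad-token id of 0 are assumptions, not part of the original code.
# ---------------------------------------------------------------------------
def create_padding_mask(seq, pad_id=0):
    # (batch, seq_len) -> (batch, 1, 1, seq_len); 1.0 wherever the token is padding.
    mask = tf.cast(tf.math.equal(seq, pad_id), tf.float32)
    return mask[:, tf.newaxis, tf.newaxis, :]


def create_look_ahead_mask(size):
    # (size, size) matrix with 1.0 above the diagonal, blocking attention to
    # future positions.
    return 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)


# Example: rerun the forward pass with real masks instead of None. With random
# ids, any 0s in `inputs` are simply treated as padding for illustration.
look_ahead_mask = create_look_ahead_mask(targets.shape[1])
padding_mask = create_padding_mask(inputs)
masked_output = transformer((inputs, targets), training=True,
                            look_ahead_mask=look_ahead_mask,
                            padding_mask=padding_mask)
print(masked_output.shape)  # (64, 50, 8000)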