# LinUCB contextual-bandit controller

class ControllerLayer:
    """LinUCB contextual bandit that selects an ``(alpha, beta)`` weight pair.

    Each arm is one combination of a value from ``alpha_values`` and a value
    from ``beta_values``.  Per arm the controller keeps a ``d x d`` matrix
    ``A`` (identity plus the sum of ``x x^T`` over updates) and a ``d x 1``
    vector ``b`` (sum of ``reward * x`` over updates), where ``x`` is the
    context vector produced by ``feature_extractor``.

    Attributes:
        alpha_values, beta_values: 1-D grids of candidate weights.
        arm_count: number of (alpha, beta) combinations.
        alpha_linucb: exploration coefficient applied to the confidence term.
        d: dimension of the context vector.
        A, b: per-arm LinUCB statistics (lists indexed by arm).
        last_arm: index of the arm chosen by the most recent
            ``get_weights`` call, or ``None`` before the first call.
        feature_extractor: callable mapping raw state features to a
            length-``d`` context vector.
    """

    def __init__(self, feature_extractor=None, grid_points=5):
        """Build the arm grid and the per-arm LinUCB statistics.

        Args:
            feature_extractor: optional callable used to turn state features
                into a length-``d`` context vector.  When omitted, an
                ``MLPFeatureExtractor`` (128 -> [128, 64] -> d, dropout 0.3)
                is created, as before.
            grid_points: number of evenly spaced candidates in [0.2, 1.0] for
                each of alpha and beta.

        Raises:
            ValueError: if ``grid_points`` is smaller than 1.
        """
        grid_points = int(grid_points)
        if grid_points < 1:
            raise ValueError("grid_points must be a positive integer")

        # np.linspace takes a sample COUNT as its third argument, not a step;
        # the previous value of 0.5 was not a valid count.
        # NOTE(review): 5 points (step 0.2) is an assumed default - confirm
        # the intended grid resolution.
        self.alpha_values = np.linspace(0.2, 1.0, grid_points)
        self.beta_values = np.linspace(0.2, 1.0, grid_points)
        self.arm_count = len(self.alpha_values) * len(self.beta_values)

        # LinUCB parameters.
        self.alpha_linucb = 0.5  # Exploration parameter
        self.d = 64  # Context dimension; must match the extractor's output size

        # Per-arm statistics: A starts as the identity, b as zeros.
        self.A = [np.identity(self.d) for _ in range(self.arm_count)]
        self.b = [np.zeros((self.d, 1)) for _ in range(self.arm_count)]

        # get_weights() only returns the weight values, so remember the arm
        # index here; callers need it as `selected_arm` for update().
        self.last_arm = None

        if feature_extractor is None:
            feature_extractor = MLPFeatureExtractor(
                input_dim=128,
                hidden_dims=[128, 64],
                output_dim=self.d,
                dropout=0.3,
            )
        self.feature_extractor = feature_extractor

    def _context_vector(self, state_features):
        """Run the feature extractor and return a ``(d, 1)`` float ndarray."""
        features = self.feature_extractor(state_features)
        # Tensor-like outputs (anything exposing .detach(), e.g. a framework
        # tensor) must be converted before they can be mixed with the NumPy
        # statistics.
        if hasattr(features, "detach"):
            features = features.detach().cpu().numpy()
        x = np.asarray(features, dtype=float).reshape(-1, 1)
        if x.shape[0] != self.d:
            raise ValueError(
                f"feature extractor produced {x.shape[0]} values, expected {self.d}"
            )
        return x

    def get_weights(self, state_features):
        """Return the ``(alpha, beta)`` pair of the arm with the highest UCB.

        The chosen arm index is stored in ``self.last_arm`` so it can be
        passed to ``update`` once the reward is known.

        Args:
            state_features: input forwarded to ``feature_extractor``.

        Returns:
            Tuple ``(alpha, beta)`` taken from ``alpha_values`` / ``beta_values``.

        Raises:
            ValueError: if the extracted context does not have ``d`` entries.
        """
        x = self._context_vector(state_features)

        ucb_scores = []
        for A_arm, b_arm in zip(self.A, self.b):
            # A is symmetric (identity plus outer products), so
            # theta^T x = b^T A^{-1} x; a single solve yields both terms
            # without forming the explicit inverse.
            a_inv_x = np.linalg.solve(A_arm, x)
            mean = (b_arm.T @ a_inv_x).item()
            # Clamp guards against a tiny negative value from rounding.
            variance = max((x.T @ a_inv_x).item(), 0.0)
            ucb_scores.append(mean + self.alpha_linucb * np.sqrt(variance))

        # Select the best arm (first one wins on ties).
        arm = int(np.argmax(ucb_scores))
        self.last_arm = arm

        # Convert the flat arm index into alpha / beta grid indices.
        alpha_idx, beta_idx = divmod(arm, len(self.beta_values))

        return self.alpha_values[alpha_idx], self.beta_values[beta_idx]

    def update(self, state_features, selected_arm, reward):
        """Fold an observed reward into the statistics of ``selected_arm``.

        Args:
            state_features: input forwarded to ``feature_extractor``.
            selected_arm: index of the arm that was played (see ``last_arm``).
            reward: scalar reward observed for that arm.

        Raises:
            ValueError: if the extracted context does not have ``d`` entries.
        """
        x = self._context_vector(state_features)

        # Update matrices for the selected arm only.
        self.A[selected_arm] += x @ x.T
        self.b[selected_arm] += reward * x
Editor is loading...