# Untitled

import pandas as pd
import numpy as np
import logging
from semopy import ModelMeans
from semopy.report import report
from semopy.plot import  semplot


# Root logging setup: every record carries a timestamp, its level and the
# name of the function that emitted it.
logging.basicConfig(
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s - %(levelname)s - [%(funcName)s] - %(message)s",
)

# Module-level logger used by everything defined in this file.
logger = logging.getLogger(__name__)

class SEMAnalysis:
    """Run a structural equation model (SEM) analysis end to end.

    The pipeline loads a semicolon-delimited CSV, cleans and standardises it,
    fits a ``semopy.ModelMeans`` model with three latent constructs
    (Visibility, Attention, Engagement), computes a composite reliability per
    construct and produces a path diagram plus a semopy report.

    Attributes are plain instance attributes:
        data_path: Path of the CSV file to analyse.
        df: The (pre-processed) data frame, ``None`` until ``load_data``.
        model: The fitted ``ModelMeans`` instance, ``None`` until ``fit_model``.
        results: Parameter table from ``model.inspect``, ``None`` until fitted.
        composite_reliabilities: Mapping construct name -> reliability.
    """

    # Raw columns that preprocess_data() reads directly.
    _REQUIRED_COLUMNS = (
        'Gender', 'Flow',
        'V11', 'V12', 'V21', 'V22', 'V31', 'V32', 'V41', 'V42',
    )

    def __init__(self, data_path):
        """Store the data path and declare the indicator items per construct.

        Args:
            data_path: Path to the semicolon-delimited CSV file.
        """
        self.data_path = data_path
        self.df = None
        self.model = None
        self.results = None
        self.composite_reliabilities = {}
        logger.info("Initializing SEMAnalysis with data path: %s", data_path)

        # Define variable groups
        self.attention_items = [
            'SM1A1', 'SM1A2', 'SM1A3', 'SM2A1', 'SM2A2', 'SM2A3',
            'SM3A1', 'SM3A2', 'SM3A3', 'SM4A1', 'SM4A2', 'SM4A3',
            'BM1A1', 'BM1A2', 'BM1A3', 'BM2A1', 'BM2A2', 'BM2A3',
            'BM3A1', 'BM3A2', 'BM3A3', 'BM4A1', 'BM4A2', 'BM4A3'
        ]
        self.engagement_items = [
            'SM1E1', 'SM1E2', 'SM1E3', 'SM1E4', 'SM1E5', 'SM2E1',
            'SM2E2', 'SM2E3', 'SM2E4', 'SM2E5', 'SM3E1', 'SM3E2',
            'SM3E3', 'SM3E4', 'SM3E5', 'SM4E1', 'SM4E2', 'SM4E3',
            'SM4E4', 'SM4E5',
            'BM1E1', 'BM1E2', 'BM1E3', 'BM1E4', 'BM2E1', 'BM2E2',
            'BM2E3', 'BM2E4', 'BM3E1', 'BM3E2', 'BM3E3', 'BM3E4',
            'BM4E1', 'BM4E2', 'BM4E3', 'BM4E4'
        ]
        self.visibility_items = ['dV1', 'dV2', 'dV3', 'dV4']

    def load_data(self):
        """Load the dataset from ``self.data_path`` into ``self.df``.

        The file is read as a semicolon-delimited CSV and every underscore is
        removed from the column names (presumably so they match the item
        names used in the model specification).

        Raises:
            Exception: Any error raised while reading the file is logged and
                re-raised unchanged.
        """
        logger.info("Loading data from: %s", self.data_path)
        try:
            frame = pd.read_csv(self.data_path, delimiter=';')
            frame.columns = frame.columns.str.replace("_", "", regex=False)
        except Exception as e:
            logger.error("Error loading data: %s", e)
            raise
        self.df = frame
        logger.info("Data loaded successfully. Shape: %s", self.df.shape)

    def preprocess_data(self):
        """Clean, derive, standardise and impute ``self.df``.

        Steps, in order:
            1. Coerce every column to numeric (blanks and other non-numeric
               values become NaN) and treat ``-99`` as missing.
            2. Derive gender dummies, the ``Inoculation`` / ``Visibility``
               condition flags from ``Flow`` and the ``dV1``..``dV4``
               difference scores.
            3. Drop columns whose variance is below ``1e-5``.
            4. Z-standardise all numeric columns and mean-impute what is
               still missing.

        ``self.df`` is only replaced once every step has succeeded, so a
        failure never leaves a half-processed frame behind.

        Raises:
            RuntimeError: If no data has been loaded yet.
            KeyError: If a required raw column is missing.
        """
        try:
            logger.info("Starting data preprocessing")

            if self.df is None:
                raise RuntimeError("load_data() must be called before preprocess_data()")

            missing = [c for c in self._REQUIRED_COLUMNS if c not in self.df.columns]
            if missing:
                raise KeyError(f"Required columns missing from data: {missing}")

            # errors='coerce' turns blanks (' ') and any other non-numeric
            # entry into NaN, so no separate blank-replacement pass is needed.
            frame = self.df.apply(pd.to_numeric, errors='coerce')

            # Handle -99 as missing value
            frame = frame.replace(-99, np.nan)

            # Create gender columns (a missing Gender yields 0 in all three)
            gender = frame['Gender']
            frame['Gender_Female'] = (gender == 1).astype('int64')
            frame['Gender_Male'] = (gender == 2).astype('int64')
            frame['Gender_NonBinary'] = (gender == 0).astype('int64')

            # Create Inoculation and Visibility condition flags from Flow
            # NOTE(review): the 0/1 'Visibility' column shares its name with
            # the latent 'Visibility' factor declared in specify_model();
            # confirm which one the structural paths are meant to use.
            flow = frame['Flow']
            frame['Inoculation'] = flow.isin([1, 4]).astype('int64')
            frame['Visibility'] = flow.isin([2, 3]).astype('int64')

            # Create difference variables dVi = Vi1 - Vi2
            for i in range(1, 5):
                frame[f'dV{i}'] = frame[f'V{i}1'] - frame[f'V{i}2']

            # Remove low variance variables (variance computed once)
            variances = frame.var()
            low_variance = variances[variances < 1e-5].index
            if len(low_variance) > 0:
                logger.warning("Dropping low-variance columns: %s", list(low_variance))
            frame = frame.drop(columns=low_variance)

            # Standardize numeric columns; 'number' avoids depending on the
            # platform's default integer width.
            numeric_cols = frame.select_dtypes(include='number').columns
            numeric = frame[numeric_cols]
            frame[numeric_cols] = (numeric - numeric.mean()) / numeric.std()

            # Handle missing values
            frame = frame.fillna(frame.mean())

            self.df = frame
            logger.info("Preprocessing completed successfully")
        except Exception as e:
            logger.error("Error during preprocessing: %s", e)
            raise

    def specify_model(self):
        """Return the SEM specification string passed to ``ModelMeans``.

        The description declares three latent factors (``=~`` lines) and
        regresses Attention and Engagement on Inoculation and Visibility.
        """
        logger.info("Specifying SEM model")
        return """
        # Measurement model for Visibility
        Visibility =~ dV1 + dV2 + dV3 + dV4

        # Measurement model for Attention
        Attention =~ SM1A1 + SM1A2 + SM1A3 + SM2A1 + SM2A2 + SM2A3 + SM3A1 + SM3A2 + SM3A3 + SM4A1 + SM4A2 + SM4A3
        Attention =~ BM1A1 + BM1A2 + BM1A3 + BM2A1 + BM2A2 + BM2A3 + BM3A1 + BM3A2 + BM3A3 + BM4A1 + BM4A2 + BM4A3

        # Measurement model for Engagement
        Engagement =~ SM1E1 + SM1E2 + SM1E3 + SM1E4 + SM1E5 + SM2E1 + SM2E2 + SM2E3 + SM2E4 + SM2E5 + SM3E1 + SM3E2 + SM3E3 + SM3E4 + SM3E5 + SM4E1 + SM4E2 + SM4E3 + SM4E4 + SM4E5
        Engagement =~ BM1E1 + BM1E2 + BM1E3 + BM1E4 + BM2E1 + BM2E2 + BM2E3 + BM2E4 + BM3E1 + BM3E2 + BM3E3 + BM3E4 + BM4E1 + BM4E2 + BM4E3 + BM4E4

        # Structural model
        Attention ~ Inoculation + Visibility
        Engagement ~ Inoculation + Visibility
        """

    def _inspect_results(self):
        """Return the parameter table, with standardised estimates if possible.

        The composite-reliability formula needs standardised loadings, so the
        table is requested with ``std_est=True``. If that request fails the
        plain table is returned instead and a warning is logged.
        """
        try:
            return self.model.inspect(std_est=True)
        except Exception as e:  # best effort: fall back rather than abort the fit
            logger.warning(
                "Standardized estimates unavailable (%s); using unstandardized estimates", e
            )
            return self.model.inspect()

    def fit_model(self):
        """Fit the SEM model on ``self.df`` and store the parameter table.

        Sets ``self.model`` and ``self.results``.

        Raises:
            Exception: Any error from model construction or fitting is logged
                (with traceback) and re-raised unchanged.
        """
        try:
            logger.info("Starting SEM model fitting")
            model_spec = self.specify_model()
            self.model = ModelMeans(model_spec)
            self.model.fit(self.df)
            self.results = self._inspect_results()
            logger.info("Model fitting completed")
        except Exception as e:
            logger.error("Error fitting the SEM model: %s", e, exc_info=True)
            raise

    def plot_model(self, output_file='sem_model.png'):
        """Create the path diagram and the semopy report for the fitted model.

        Args:
            output_file: File name handed to ``semplot`` for the diagram.

        Raises:
            RuntimeError: If the model has not been fitted yet.
            Exception: Any error from ``semplot`` / ``report`` is logged and
                re-raised unchanged.
        """
        try:
            logger.info("Creating path diagram")

            if self.model is None:
                raise RuntimeError("fit_model() must be called before plot_model()")

            semplot(self.model, output_file)
            report(self.model, "model_results")

            logger.info("Path diagram saved to %s", output_file)
        except Exception as e:
            logger.error("Error creating path diagram: %s", e, exc_info=True)
            raise

    def calculate_composite_reliability(self, items, construct=None):
        """Calculate the composite reliability for a set of indicator items.

        Uses ``CR = (sum(l))**2 / ((sum(l))**2 + sum(1 - l**2))`` where ``l``
        are the factor loadings of ``items``.

        Only loading rows of the parameter table are used: rows whose ``lval``
        is one of ``items``, whose operator is ``~`` and whose right-hand side
        is not the intercept marker ``1``. Residual variances (``~~``) and
        intercepts are therefore excluded. Standardised estimates
        (``Est. Std``) are preferred when the table provides them.

        Args:
            items: Names of the observed indicators of one construct.
            construct: Optional latent variable name; when given, only
                loadings on that latent variable are used.

        Returns:
            The composite reliability as a float, or ``None`` when it cannot
            be computed (no fitted results, no matching loadings, zero
            denominator, or an unexpected error, which is logged).
        """
        try:
            logger.info("Calculating composite reliability for %d items", len(items))
            table = self.results

            is_loading = table['lval'].isin(items)
            if 'op' in table.columns:
                # '~~' rows are (co)variances, not loadings.
                is_loading &= table['op'] == '~'
            if 'rval' in table.columns:
                rhs = table['rval'].astype(str)
                # '<item> ~ 1' rows are intercepts.
                is_loading &= rhs != '1'
                if construct is not None:
                    is_loading &= rhs == str(construct)

            column = 'Est. Std' if 'Est. Std' in table.columns else 'Estimate'
            loadings = pd.to_numeric(
                table.loc[is_loading, column], errors='coerce'
            ).to_numpy(dtype=float)
            loadings = loadings[~np.isnan(loadings)]

            if loadings.size == 0:
                logger.warning("No loadings found for the requested items")
                return None

            squared_sum = np.sum(loadings) ** 2
            denominator = squared_sum + np.sum(1 - loadings ** 2)
            if denominator == 0:
                logger.warning("Composite reliability undefined (zero denominator)")
                return None
            return float(squared_sum / denominator)
        except Exception as e:
            logger.error("Error calculating composite reliability: %s", e)
            return None

    def run(self):
        """Run the entire SEM analysis pipeline.

        Loads and preprocesses the data, fits the model, computes the
        composite reliability of each construct, writes the diagram/report
        and prints the parameter table and reliabilities to stdout.

        Raises:
            Exception: Any pipeline error is logged with traceback and
                re-raised unchanged.
        """
        logger.info("Starting SEM analysis pipeline")

        try:
            self.load_data()
            self.preprocess_data()
            self.fit_model()

            # Calculate composite reliabilities
            constructs = {
                "Attention": self.attention_items,
                "Engagement": self.engagement_items,
                "Visibility": self.visibility_items
            }

            for construct, items in constructs.items():
                logger.info("Processing reliability for construct: %s", construct)
                reliability = self.calculate_composite_reliability(items, construct)
                self.composite_reliabilities[construct] = reliability

            # Create model visualization (also writes the semopy report)
            self.plot_model()

            # Print results: the parameter table itself. report() only writes
            # files, so printing its return value showed nothing useful.
            print("\nModel Results:")
            print("=" * 80)
            print(self.results.to_string())

            print("\nComposite Reliabilities:")
            print("-" * 40)
            for construct, rel in self.composite_reliabilities.items():
                # A reliability may be None when it could not be computed.
                shown = "n/a" if rel is None else f"{rel:.4f}"
                print(f"{construct:<20} = {shown:>10}")

            logger.info("Analysis pipeline completed successfully")

        except Exception:
            logger.error("Error in analysis pipeline", exc_info=True)
            raise

if __name__ == "__main__":
    # Script entry point: run the full pipeline on the thesis data set and
    # log (with traceback) before re-raising if anything goes wrong.
    logger.info("Starting SEM Analysis script")

    try:
        csv_path = './ThesisData6.csv'
        logger.info(f"Using data path: {csv_path}")

        analysis = SEMAnalysis(csv_path)
        analysis.run()
    except Exception:
        logger.exception("Script failed with error")
        raise
    else:
        logger.info("Script completed successfully")
# Editor is loading...