# K-MEANS (custom dataset)
df <- read.csv("C:/Users/agarw/Downloads/custom_dataset.csv")
kmeans_result <- kmeans(df[, c(1, 2)], centers = 3, nstart = 20)
print(kmeans_result$cluster)

library(ggplot2)
df$Cluster <- as.factor(kmeans_result$cluster)
centers <- as.data.frame(kmeans_result$centers)
colnames(centers) <- c("x", "y")  # rename to match x and y in df (assumes the CSV's first two columns are named x and y)

# Plot the clusters and their centers
ggplot(df, aes(x = x, y = y, color = Cluster)) +
  geom_point(size = 3) +                          # data points
  geom_point(data = centers, aes(x = x, y = y),   # cluster centers
             color = "black", shape = 8, size = 4) +
  labs(title = "K-Means Clustering", x = "Feature 1", y = "Feature 2") +
  theme_minimal()

# K-MEANS (iris dataset)
library(ggplot2)
data(iris)
head(iris)
iris_data <- iris[, 1:4]
iris_scaled <- scale(iris_data)
set.seed(123)
kmeans_result <- kmeans(iris_scaled, centers = 3, nstart = 20)
iris$Cluster <- as.factor(kmeans_result$cluster)

# Note: the centers come from the scaled data, while the points below are on the
# raw Sepal axes, so the stars will not line up exactly with the clusters; plot
# the scaled data (or unscale the centers) for an exact match.
ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width, color = Cluster)) +
  geom_point(size = 3) +
  geom_point(data = as.data.frame(kmeans_result$centers),
             aes(x = Sepal.Length, y = Sepal.Width),
             color = "black", shape = 8, size = 4) +
  labs(title = "K-Means Clustering on Iris Dataset",
       x = "Sepal Length", y = "Sepal Width") +
  theme_minimal()
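# A quick way to sanity-check the choice of centers = 3 above is the elbow
# method: refit k-means for a range of k and plot the total within-cluster sum
# of squares. A minimal sketch reusing iris_scaled from above (the wss name is
# ours, not part of the original snippets):
set.seed(123)
wss <- sapply(1:10, function(k) {
  kmeans(iris_scaled, centers = k, nstart = 20)$tot.withinss
})
plot(1:10, wss, type = "b", pch = 19,
     xlab = "Number of clusters k",
     ylab = "Total within-cluster sum of squares",
     main = "Elbow Method")
# Pick k near the "elbow" where the curve flattens; for iris that is around 3.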
# PCA (custom dataset)
df <- read.csv("C:/Users/agarw/Downloads/custom_dataset.csv")
custom_data <- df[, sapply(df, is.numeric)]
custom_scaled <- scale(custom_data)
pca_custom <- prcomp(custom_scaled, center = TRUE, scale. = TRUE)
summary(pca_custom)  # proportion of variance explained by each component

ggplot(as.data.frame(pca_custom$x), aes(x = PC1, y = PC2)) +
  geom_point(size = 3) +
  labs(title = "PCA on Custom Dataset",
       x = "Principal Component 1", y = "Principal Component 2") +
  theme_minimal()

# PCA (iris dataset)
data(iris)
iris_data <- iris[, 1:4]
iris_scaled <- scale(iris_data)
pca_iris <- prcomp(iris_scaled, center = TRUE, scale. = TRUE)
summary(pca_iris)

library(ggplot2)
ggplot(as.data.frame(pca_iris$x), aes(x = PC1, y = PC2, color = iris$Species)) +
  geom_point(size = 3) +
  labs(title = "PCA on Iris Dataset",
       x = "Principal Component 1", y = "Principal Component 2") +
  theme_minimal()

# HIERARCHICAL CLUSTERING (custom dataset)
df <- read.csv("path/to/your/custom_dataset.csv")
custom_data <- df[, sapply(df, is.numeric)]
custom_scaled <- scale(custom_data)
dist_matrix_custom <- dist(custom_scaled)
hc_custom <- hclust(dist_matrix_custom, method = "complete")
plot(hc_custom, main = "Hierarchical Clustering Dendrogram (Custom Dataset)",
     xlab = "", sub = "")

# HIERARCHICAL CLUSTERING (iris dataset)
data(iris)
iris_data <- iris[, 1:4]
iris_scaled <- scale(iris_data)
dist_matrix <- dist(iris_scaled)
hc_iris <- hclust(dist_matrix, method = "complete")
plot(hc_iris, main = "Hierarchical Clustering Dendrogram (Iris)",
     xlab = "", sub = "")
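# The dendrograms above only draw the tree; to get concrete cluster labels,
# base R's cutree() cuts an hclust tree at a chosen number of clusters. A
# minimal sketch using hc_iris from above (the species cross-tab is our
# illustration, not part of the original snippets):
iris_hc_clusters <- cutree(hc_iris, k = 3)
table(iris_hc_clusters, iris$Species)  # clusters vs. true species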
# KNN (iris dataset)
library(class)
data(iris)
iris_data <- iris[, 1:4]
iris_labels <- iris$Species
set.seed(123)
train_index <- sample(1:nrow(iris_data), 0.7 * nrow(iris_data))
train_data <- iris_data[train_index, ]
test_data <- iris_data[-train_index, ]
train_labels <- iris_labels[train_index]
test_labels <- iris_labels[-train_index]
knn_predictions <- knn(train_data, test_data, train_labels, k = 3)
table(knn_predictions, test_labels)  # confusion matrix

# KNN (custom dataset)
library(class)
df <- read.csv("path/to/your/custom_dataset.csv")
custom_data <- df[, sapply(df, is.numeric)]
custom_labels <- df$target_column  # replace with the actual label column name
set.seed(123)
train_index_custom <- sample(1:nrow(custom_data), 0.7 * nrow(custom_data))
train_data_custom <- custom_data[train_index_custom, ]
test_data_custom <- custom_data[-train_index_custom, ]
train_labels_custom <- custom_labels[train_index_custom]
test_labels_custom <- custom_labels[-train_index_custom]
knn_predictions_custom <- knn(train_data_custom, test_data_custom,
                              train_labels_custom, k = 3)
table(knn_predictions_custom, test_labels_custom)
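# A hedged follow-up to the confusion matrices above: overall accuracy is the
# sum of the diagonal over the total. Sketch using the iris objects from above:
conf_mat <- table(knn_predictions, test_labels)
accuracy <- sum(diag(conf_mat)) / sum(conf_mat)
print(accuracy)  # proportion of test points classified correctly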
# DATA TRANSFORMATION
# Example dataset
data <- data.frame(
  ID = 1:6,
  Name = c("Alice", "Bob", "Charlie", "David", "Eva", "Frank"),
  Age = c(23, 45, 34, NA, 29, 41),
  Score = c(89, 76, 92, 85, NA, 73),
  Group = c("A", "B", "A", "B", "A", "B")
)

names(data) <- c("ID", "Name", "Age", "Score", "Category")  # rename columns
data$ID <- as.character(data$ID)          # change data types
data$Category <- as.factor(data$Category)

# Replace NA in Age with the mean
data$Age[is.na(data$Age)] <- mean(data$Age, na.rm = TRUE)
# Replace NA in Score with the median
data$Score[is.na(data$Score)] <- median(data$Score, na.rm = TRUE)

# Check for missing values
sum(is.na(data))
# Missing values by column
colSums(is.na(data))
# Percentage of missing values by column
colSums(is.na(data)) / nrow(data) * 100

cleaned_data <- na.omit(data)  # remove rows with missing values

# Filter rows where Age > 30
filtered_data <- data[data$Age > 30, ]
# Using dplyr
library(dplyr)
filtered_data <- data %>% filter(Age > 30)

# Sort by Age
sorted_data <- data[order(data$Age), ]
# Using dplyr
sorted_data <- data %>% arrange(Age)

# Add a column indicating whether Score is above 80
data$HighScore <- ifelse(data$Score > 80, TRUE, FALSE)
# Using dplyr
data <- data %>% mutate(HighScore = Score > 80)

# Summary of Age by Category
summary_data <- data %>%
  group_by(Category) %>%
  summarise(
    Avg_Age = mean(Age, na.rm = TRUE),
    Max_Score = max(Score, na.rm = TRUE)
  )

# RESHAPE DATA
library(tidyr)
# Make data wider
wide_data <- data %>% pivot_wider(names_from = Category, values_from = Score)
# Make data longer (invert the pivot above; starts_with("A") would also
# capture Age, so name the category columns explicitly)
long_data <- wide_data %>%
  pivot_longer(cols = c(A, B), names_to = "Category", values_to = "Score")

# Create another dataset
new_data <- data.frame(ID = c("1", "2", "3"), Gender = c("F", "M", "M"))
# Merge datasets
merged_data <- merge(data, new_data, by = "ID")

# Note: each scaling/transformation below is shown independently; in practice
# apply one of them to the original Score column, not all of them in sequence.
# Standardize Score column (z-score)
data$Score <- scale(data$Score)
# Min-max scaling
data$Score <- (data$Score - min(data$Score)) / (max(data$Score) - min(data$Score))
# Mean normalization
data$Score <- (data$Score - mean(data$Score)) / (max(data$Score) - min(data$Score))
# Robust scaling using median and IQR
iqr <- IQR(data$Score, na.rm = TRUE)
median_val <- median(data$Score, na.rm = TRUE)
data$Score <- (data$Score - median_val) / iqr
# Log transformation
data$Score <- log(data$Score + 1)
# Exponential transformation
data$Score <- exp(data$Score)
# Square-root transformation
data$Score <- sqrt(data$Score)
# Decimal scaling
max_abs <- max(abs(data$Score), na.rm = TRUE)
data$Score <- data$Score / (10^floor(log10(max_abs) + 1))
# Max-abs scaling
data$Score <- data$Score / max(abs(data$Score), na.rm = TRUE)
# Range scaling to [a, b]
a <- 0
b <- 1
data$Score <- (data$Score - min(data$Score)) /
  (max(data$Score) - min(data$Score)) * (b - a) + a

# Recode values
data$Category <- recode(data$Category, "A" = "Group_A", "B" = "Group_B")
# Replace specific values
data$Category[data$Category == "A"] <- "Alpha"

# Bin/discretize data
data$AgeGroup <- cut(data$Age, breaks = c(0, 30, 50, Inf),
                     labels = c("Young", "Middle", "Senior"))

unique_data <- data[!duplicated(data), ]            # eliminate duplicate rows
data$Age <- sapply(data$Age, function(x) x^2)       # apply a custom function to a column
selected_data <- data %>% select(starts_with("A"))  # select columns matching a pattern
data <- data %>% select(-Category)                  # remove an unwanted column
data <- data %>% select(Name, Age, everything())    # reorder columns
transposed_data <- t(data)                          # transpose data
data <- data %>% mutate(RowID = row_number())       # add a row identifier

# Convert wide to long data
long_data <- pivot_longer(data, cols = c(Age, Score),
                          names_to = "Variable", values_to = "Value")
# Convert long to wide data
wide_data <- pivot_wider(long_data, names_from = Variable, values_from = Value)

# Lag/lead variables
data <- data %>% mutate(LaggedScore = lag(Score), LeadScore = lead(Score))
# Cumulative sum
data <- data %>% mutate(CumulativeScore = cumsum(Score))
# Row-wise calculation
data <- data %>% rowwise() %>% mutate(Age_Score_Sum = sum(Age, Score, na.rm = TRUE))
# Expand to all combinations
expanded_data <- expand.grid(Category = c("A", "B"), Score = seq(70, 90, by = 5))
# Split a column (illustrative; the example names have no space, so LastName is NA)
data <- data %>% separate(Name, into = c("FirstName", "LastName"), sep = " ")
# Combine columns
data <- data %>% unite("FullName", c("FirstName", "LastName"), sep = " ")
# Rank rows based on a column
data <- data %>% mutate(ScoreRank = rank(-Score))
# Filter on multiple conditions
filtered_data <- data %>% filter(Age > 30 & Score > 80)

# FINDING OUTLIERS
# Compute z-scores
df$z_score <- scale(df$numeric_column)
# Identify outliers (|z| > 3)
outliers <- df[abs(df$z_score) > 3, ]
print("Outliers using Z-Score:")
print(outliers)

# Boxplot with outliers
boxplot(df$numeric_column, main = "Boxplot to Detect Outliers", col = "lightblue")
# Density plot
ggplot(df, aes(x = numeric_column)) +
  geom_density(fill = "lightgreen", alpha = 0.5) +
  ggtitle("Density Plot to Detect Outliers")

# Remove outliers using the IQR method (define the 1.5 * IQR fences first)
q1 <- quantile(df$numeric_column, 0.25, na.rm = TRUE)
q3 <- quantile(df$numeric_column, 0.75, na.rm = TRUE)
iqr_val <- q3 - q1
lower_bound <- q1 - 1.5 * iqr_val
upper_bound <- q3 + 1.5 * iqr_val
df_cleaned <- df[df$numeric_column >= lower_bound & df$numeric_column <= upper_bound, ]
print("Dataset after Removing Outliers:")
print(df_cleaned)

# Check the structure
str(df)
# Check the class of every column
sapply(df, class)
# Data type of one column
class(df$age)
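# Instead of dropping outlier rows, a common alternative is to cap (winsorize)
# values at the IQR fences. A minimal sketch reusing lower_bound/upper_bound
# from the IQR section above (numeric_capped is our placeholder name):
df$numeric_capped <- pmin(pmax(df$numeric_column, lower_bound), upper_bound)
summary(df$numeric_capped)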
# VISUALIZATION
# Histogram
library(ggplot2)
ggplot(data, aes(x = Score)) +
  geom_histogram(binwidth = 5, fill = "blue", color = "black") +
  labs(title = "Histogram of Scores", x = "Score", y = "Frequency")

# Density plot
ggplot(data, aes(x = Score)) +
  geom_density(fill = "blue", alpha = 0.5) +
  labs(title = "Density Plot of Scores", x = "Score", y = "Density")

# Boxplot
ggplot(data, aes(x = Category, y = Score)) +
  geom_boxplot(fill = "lightblue") +
  labs(title = "Box Plot of Scores by Category", x = "Category", y = "Score")

# Violin plot
ggplot(data, aes(x = Category, y = Score)) +
  geom_violin(fill = "lightgreen", alpha = 0.7) +
  labs(title = "Violin Plot of Scores by Category", x = "Category", y = "Score")

# Bar plot
ggplot(data, aes(x = Category)) +
  geom_bar(fill = "purple") +
  labs(title = "Bar Plot of Categories", x = "Category", y = "Count")

# Scatter plot
ggplot(data, aes(x = Age, y = Score)) +
  geom_point(color = "blue", size = 3) +
  labs(title = "Scatter Plot of Age vs. Score", x = "Age", y = "Score")

# Line plot
ggplot(data, aes(x = Age, y = Score, group = 1)) +
  geom_line(color = "red", size = 1) +
  geom_point(size = 2) +
  labs(title = "Line Plot of Age vs. Score", x = "Age", y = "Score")

# Heatmap of correlations
library(reshape2)
cor_data <- cor(data[, c("Age", "Score")], use = "complete.obs")
melted_data <- melt(cor_data)
ggplot(melted_data, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", midpoint = 0) +
  labs(title = "Heatmap of Correlations", x = "", y = "")

# Facet plot
ggplot(data, aes(x = Score)) +
  geom_histogram(binwidth = 5, fill = "orange", color = "black") +
  facet_wrap(~ Category) +
  labs(title = "Histogram of Scores by Category", x = "Score", y = "Frequency")

# QQ plot
ggplot(data, aes(sample = Score)) +
  stat_qq() +
  stat_qq_line(color = "red") +
  labs(title = "QQ Plot of Scores",
       x = "Theoretical Quantiles", y = "Sample Quantiles")

# Pair plot (ggpairs takes its title as an argument rather than via labs())
library(GGally)
ggpairs(data[, c("Age", "Score")], title = "Pair Plot of Age and Score")

# Dot plot
ggplot(data, aes(x = Category, y = Score)) +
  geom_dotplot(binaxis = "y", stackdir = "center", dotsize = 0.8) +
  labs(title = "Dot Plot of Scores by Category", x = "Category", y = "Score")

# Area plot
ggplot(data, aes(x = Age, y = Score)) +
  geom_area(fill = "lightblue", alpha = 0.5) +
  labs(title = "Area Plot of Age vs. Score", x = "Age", y = "Score")

# Bubble chart
ggplot(data, aes(x = Age, y = Score, size = Age)) +
  geom_point(color = "blue", alpha = 0.7) +
  labs(title = "Bubble Chart of Age vs. Score",
       x = "Age", y = "Score", size = "Age")

# Stacked bar chart
ggplot(data, aes(x = Category, fill = HighScore)) +
  geom_bar(position = "stack") +
  labs(title = "Stacked Bar Chart by HighScore", x = "Category", y = "Count")

# Radar chart
library(fmsb)
# fmsb::radarchart() expects one column per axis, with row 1 = axis maxima and
# row 2 = axis minima, followed by the data rows.
radar_values <- data.frame(
  Average = mean(data$Score, na.rm = TRUE),
  SD = sd(data$Score, na.rm = TRUE),
  Max = max(data$Score, na.rm = TRUE)
)
radar_data <- rbind(rep(100, 3), rep(0, 3), radar_values)
radarchart(radar_data)

# Pie chart
ggplot(data, aes(x = "", fill = Category)) +
  geom_bar(width = 1, stat = "count") +
  coord_polar(theta = "y") +
  labs(title = "Pie Chart of Categories", x = "", y = "")

# Spline chart
ggplot(data, aes(x = Age, y = Score)) +
  geom_smooth(method = "loess", color = "darkgreen") +
  labs(title = "Spline Chart of Age vs. Score", x = "Age", y = "Score")
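# Any of the ggplot charts above can be written to disk with ggsave(), which
# saves the most recently displayed plot by default. The filename and
# dimensions below are placeholder choices:
ggsave("last_plot.png", width = 6, height = 4, dpi = 300)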
# CORRELATION
# Several snippets below assume extra columns (Height, Weight, BinaryVar1/2)
# exist in the data; they are placeholders like numeric_column elsewhere.
cor(data$Age, data$Score, method = "pearson")
cor(data$Age, data$Score, method = "spearman")
cor(data$Age, data$Score, method = "kendall")

# Correlation matrix
cor_matrix <- cor(data[, c("Age", "Score", "Height")],
                  use = "complete.obs", method = "pearson")
print(cor_matrix)

# Significance test
cor_test <- cor.test(data$Age, data$Score, method = "pearson")
print(cor_test)

# Partial correlation
library(ppcor)
partial_cor <- pcor(data[, c("Age", "Score", "Height")])
print(partial_cor$estimate)

# Pairwise correlation (numeric columns only)
pairwise_cor <- cor(data, use = "pairwise.complete.obs")
print(pairwise_cor)

# Matrix of p-values
library(Hmisc)
cor_res <- rcorr(as.matrix(data[, c("Age", "Score", "Height")]))
cor_res$P

# Biweight midcorrelation
library(WGCNA)
bicor_res <- bicor(data[, c("Age", "Score", "Height")])
print(bicor_res)

# Distance correlation
library(energy)
dcor_res <- dcor(data$Age, data$Score)
print(dcor_res)

# Polychoric correlation (for ordinal variables)
library(polycor)
poly_cor <- polychor(data$Age, data$Score)
print(poly_cor)

# Phi coefficient (for two binary variables)
library(psych)
phi_res <- phi(table(data$BinaryVar1, data$BinaryVar2))
print(phi_res)

# Covariance
cov_res <- cov(data$Age, data$Score, use = "complete.obs")
print(cov_res)

# Correlation between two sets of variables
library(psych)
cor2_res <- cor2(data[, c("Age", "Score")], data[, c("Height", "Weight")])
print(cor2_res)

# UNIVARIATE ANALYSIS
# Descriptive statistics for a numeric column
library(moments)  # for skewness() and kurtosis()
# Helper to compute the mode (base R has no mode function for data)
get_mode <- function(x) {
  ux <- unique(x[!is.na(x)])
  ux[which.max(tabulate(match(x, ux)))]
}
summary_stats <- summary(df$numeric_column)
mean_val <- mean(df$numeric_column, na.rm = TRUE)
median_val <- median(df$numeric_column, na.rm = TRUE)
mode_val <- get_mode(df$numeric_column)
variance_val <- var(df$numeric_column, na.rm = TRUE)
sd_val <- sd(df$numeric_column, na.rm = TRUE)
range_val <- range(df$numeric_column, na.rm = TRUE)
IQR_val <- IQR(df$numeric_column, na.rm = TRUE)
skewness_val <- skewness(df$numeric_column, na.rm = TRUE)
kurtosis_val <- kurtosis(df$numeric_column, na.rm = TRUE)

# Quartiles for a numeric column
quantile(df$numeric_column, probs = c(0.25, 0.5, 0.75), na.rm = TRUE)
# Deciles for a numeric column
quantile(df$numeric_column, probs = seq(0, 1, by = 0.1), na.rm = TRUE)
# 10th, 50th (median), and 90th percentiles
percentiles <- quantile(df$numeric_column, probs = c(0.10, 0.50, 0.90), na.rm = TRUE)
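# As a shortcut for the block of individual statistics above, psych::describe()
# reports most of them (n, mean, sd, median, skew, kurtosis, range, se) in one
# call. Sketch with the same placeholder column:
library(psych)
describe(df$numeric_column)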
# BIVARIATE ANALYSIS
# Linear regression model
lm_model <- lm(numeric_column2 ~ numeric_column1, data = df)
summary(lm_model)

# Plot the regression line
plot(df$numeric_column1, df$numeric_column2,
     main = "Scatter Plot with Regression Line",
     xlab = "Numeric Column1", ylab = "Numeric Column2",
     col = "blue", pch = 16)
abline(lm_model, col = "red")

# ANOVA to check for differences in means
anova_result <- aov(numeric_column ~ categorical_column, data = df)
summary(anova_result)

# Summary statistics by group
aggregate(numeric_column ~ categorical_column, data = df, FUN = summary)

# Contingency table for categorical variables
contingency_table <- table(df$categorical_column1, df$categorical_column2)
print(contingency_table)

# Chi-square test of independence
chi_square_result <- chisq.test(contingency_table)
print(chi_square_result)

# Mosaic plot
mosaicplot(contingency_table, main = "Mosaic Plot of Categorical Variables",
           color = TRUE)

# Stacked bar plot
library(ggplot2)
ggplot(df, aes(x = categorical_column1, fill = categorical_column2)) +
  geom_bar(position = "stack") +
  labs(title = "Stacked Bar Plot of Categorical Variables",
       x = "Categorical Column1", y = "Count")

# Quick reference versions of the same analyses
plot(df$numeric_column1, df$numeric_column2)             # scatter plot
cor(df$numeric_column1, df$numeric_column2)              # correlation
lm_model <- lm(numeric_column2 ~ numeric_column1, data = df)
summary(lm_model)                                        # linear regression
boxplot(numeric_column ~ categorical_column, data = df)  # boxplot
anova_result <- aov(numeric_column ~ categorical_column, data = df)
summary(anova_result)                                    # ANOVA
table(df$categorical_column1, df$categorical_column2)    # contingency table
chi_square_result <- chisq.test(contingency_table)
print(chi_square_result)  # use print(), not summary(), for htest objects

# Scatter plot colored by a categorical variable
plot(df$numeric_column1, df$numeric_column2,
     col = df$categorical_column, pch = 16,
     main = "Scatter Plot Colored by Categorical Variable")

# Pairwise scatter plot for multiple variables
pairs(df[, c("numeric_column1", "numeric_column2", "numeric_column3")],
      main = "Pairwise Scatter Plot")

# TIME SERIES ANALYSIS (AirPassengers)
# Step 1: Load the built-in AirPassengers dataset
data("AirPassengers")

# Step 2: Check the structure and data type
str(AirPassengers)
class(AirPassengers)

# Step 3: Check for missing values
sum(is.na(AirPassengers))

# Step 4: Check the start and end dates of the series
start(AirPassengers)
end(AirPassengers)

# Step 5: Check the frequency (observations per year)
frequency(AirPassengers)

# Step 6: Summary of the dataset
summary(AirPassengers)

# Step 7: Plot the decomposition (trend, seasonal, random)
decomposed_data <- decompose(AirPassengers)
plot(decomposed_data)

# Step 8: Plot the dataset
plot(AirPassengers, main = "Air Passengers Time Series",
     ylab = "Number of Passengers", xlab = "Time", col = "blue")

# Step 9: Plot the time series
plot.ts(AirPassengers, main = "Air Passengers Time Series",
        ylab = "Number of Passengers", col = "red")

# Step 10: Draw the regression line (linear model against time)
lm_model <- lm(AirPassengers ~ time(AirPassengers))
abline(lm_model, col = "green")

# Step 11: Print the cycle (month positions) across the years
cycle(AirPassengers)

# Step 12: Make the series stationary (constant mean and variance)
# a. Log transformation (stabilizes the growing variance)
log_air_passengers <- log(AirPassengers)
plot(log_air_passengers, main = "Log-transformed Air Passengers",
     ylab = "Log(Number of Passengers)", col = "purple")
# b. Difference the log-transformed data to remove the trend
diff_log_air_passengers <- diff(log_air_passengers)
plot(diff_log_air_passengers,
     main = "Differenced Log-transformed Air Passengers",
     ylab = "Differenced Log(Number of Passengers)", col = "orange")

# Step 13: Boxplot across months to show the seasonal effect
boxplot(AirPassengers ~ cycle(AirPassengers),
        main = "Seasonal Effect by Month",
        xlab = "Month", ylab = "Number of Passengers", col = "lightblue")

# ACF (autocorrelation function) plot
acf(AirPassengers, main = "Autocorrelation Function (ACF)")
# PACF (partial autocorrelation function) plot
pacf(AirPassengers, main = "Partial Autocorrelation Function (PACF)")

# Augmented Dickey-Fuller test for stationarity
library(tseries)
adf_test <- adf.test(AirPassengers)
print(adf_test)

# STL decomposition (seasonal-trend decomposition using LOESS)
stl_decomposition <- stl(AirPassengers, s.window = "periodic")
plot(stl_decomposition)

# Fit an ARIMA model
library(forecast)
arima_model <- auto.arima(AirPassengers)
summary(arima_model)

# Forecast the next 12 months
forecast_values <- forecast(arima_model, h = 12)
plot(forecast_values)

# Fit an exponential smoothing (ETS) model
ets_model <- ets(AirPassengers)
summary(ets_model)
# Forecast with the ETS model
ets_forecast <- forecast(ets_model, h = 12)
plot(ets_forecast)
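# Before trusting the ARIMA forecasts above, it is worth checking that the
# residuals resemble white noise. forecast::checkresiduals() plots the
# residuals with their ACF and runs a Ljung-Box test in one call. Sketch using
# arima_model from above:
checkresiduals(arima_model)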
# Rolling-origin cross-validation for time series (example using ARIMA)
library(forecast)
n <- length(AirPassengers)
train_size <- n - 12  # train on everything except the last 12 months
# Only folds whose 12-month test window stays inside the data are usable;
# shrink train_size to get more folds.
for (i in seq(train_size, n - 12)) {
  train_data <- ts(AirPassengers[1:i], start = start(AirPassengers), frequency = 12)
  test_data <- AirPassengers[(i + 1):(i + 12)]
  # Fit an ARIMA model on the training window
  model <- auto.arima(train_data)
  # Forecast the next 12 months
  forecast_values <- forecast(model, h = 12)
  # Compare the forecast with the held-out test data
  print(accuracy(forecast_values, test_data))
}

# Simple moving average (use stats::filter explicitly; dplyr masks filter())
sma <- stats::filter(AirPassengers, rep(1/12, 12), sides = 2)
plot(sma, main = "Simple Moving Average of AirPassengers", col = "red", lwd = 2)

# Cumulative moving average
cma <- cumsum(AirPassengers) / seq_along(AirPassengers)
plot(cma, main = "Cumulative Moving Average of AirPassengers",
     col = "green", lwd = 2, type = "l")

# Seasonal subseries plot
library(forecast)
seasonplot(AirPassengers, year.labels = TRUE, main = "Seasonal Subseries Plot")

# Seasonal adjustment
seasonally_adjusted <- seasadj(stl_decomposition)
plot(seasonally_adjusted, main = "Seasonally Adjusted Air Passengers Data")

# Time series clustering
library(cluster)
library(forecast)
# Prepare a time series matrix (for clustering multiple series; here a single
# series, so k-means groups the observations into low/medium/high levels)
data_matrix <- matrix(AirPassengers, ncol = 1)
clusters <- kmeans(data_matrix, centers = 3)
plot(clusters$centers, main = "Cluster Centers of Time Series")

# Ljung-Box test for autocorrelation in the ARIMA residuals
Box.test(residuals(arima_model), lag = 12, type = "Ljung-Box")

# Fit a Holt-Winters model
holt_winters_model <- HoltWinters(AirPassengers)
# Forecast with the Holt-Winters model
holt_winters_forecast <- forecast(holt_winters_model, h = 12)
plot(holt_winters_forecast)

# USER-DEFINED FUNCTIONS
# Define a function to calculate the square of a number
square <- function(x) {
  return(x^2)
}
# Call the function
result <- square(4)
print(result)  # Output: 16

# Define a function to calculate the area of a rectangle
rectangle_area <- function(length, width) {
  area <- length * width
  return(area)
}
# Call the function
area <- rectangle_area(5, 10)
print(area)  # Output: 50

# Define a function that adds or subtracts based on the operation argument
calculate <- function(a, b, operation = "add") {
  if (operation == "add") {
    return(a + b)
  } else if (operation == "subtract") {
    return(a - b)
  } else {
    stop("Invalid operation. Use 'add' or 'subtract'.")
  }
}
# Call the function with addition
result_add <- calculate(5, 3, "add")
print(result_add)  # Output: 8
# Call the function with subtraction
result_subtract <- calculate(5, 3, "subtract")
print(result_subtract)  # Output: 2
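# A brief usage note on the functions above: because R arithmetic is
# vectorized, the same definitions work element-wise on whole vectors:
print(square(1:5))                                # 1 4 9 16 25
print(calculate(c(10, 20), c(1, 2), "subtract"))  # 9 18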