Untitled
unknown
r
2 years ago
18 kB
5
Indexable
> > setwd("~/STAT 277") > > # Import the data from "amesOneFam.csv" > ames_data <- read.csv("amesOneFam.csv") > > # Create a subset of only quantitative values > quantitative_subset <- ames_data[sapply(ames_data, is.numeric)] > attributes(quantitative_subset) $names [1] "X" "Lot_Frontage" "Lot_Area" "Year_Built" "Year_Remod_Add" [6] "Mas_Vnr_Area" "BsmtFin_SF_1" "BsmtFin_SF_2" "Bsmt_Unf_SF" "Total_Bsmt_SF" [11] "First_Flr_SF" "Second_Flr_SF" "Gr_Liv_Area" "Bsmt_Full_Bath" "Bsmt_Half_Bath" [16] "Full_Bath" "Half_Bath" "Bedroom_AbvGr" "Kitchen_AbvGr" "TotRms_AbvGrd" [21] "Fireplaces" "Garage_Cars" "Garage_Area" "Wood_Deck_SF" "Open_Porch_SF" [26] "Enclosed_Porch" "Three_season_porch" "Screen_Porch" "Pool_Area" "Misc_Val" [31] "Mo_Sold" "Year_Sold" "Sale_Price" "Longitude" "Latitude" $row.names [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 [22] 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 [43] 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 [64] 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 [85] 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 [106] 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 [127] 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 [148] 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 [169] 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 [190] 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 [211] 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 [232] 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 [253] 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 [274] 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 [295] 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 [316] 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 [337] 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 [358] 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 [379] 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 [400] 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 [421] 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 [442] 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 [463] 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 [484] 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 [505] 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 [526] 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 [547] 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 [568] 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 [589] 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 [610] 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 [631] 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 [652] 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 [673] 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 [694] 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 [715] 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 [736] 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 [757] 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 [778] 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 [799] 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 [820] 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 [841] 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 [862] 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 [883] 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 [904] 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 [925] 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 [946] 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 [967] 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 [988] 988 989 990 991 992 993 994 995 996 997 998 999 1000 [ reached getOption("max.print") -- omitted 1002 entries ] $class [1] "data.frame" > summary(quantitative_subset) X Lot_Frontage Lot_Area Year_Built Year_Remod_Add Mas_Vnr_Area Min. : 1.0 Min. : 0.00 Min. : 2500 Min. :1872 Min. :1950 Min. : 0.00 1st Qu.: 709.5 1st Qu.: 50.00 1st Qu.: 8127 1st Qu.:1950 1st Qu.:1963 1st Qu.: 0.00 Median :1394.5 Median : 65.00 Median : 9750 Median :1968 Median :1992 Median : 0.00 Mean :1423.6 Mean : 58.85 Mean : 10832 Mean :1968 Mean :1983 Mean : 93.56 3rd Qu.:2120.8 3rd Qu.: 80.00 3rd Qu.: 11795 3rd Qu.:1996 3rd Qu.:2002 3rd Qu.: 143.00 Max. :2930.0 Max. :313.00 Max. :215245 Max. :2010 Max. :2010 Max. :1600.00 BsmtFin_SF_1 BsmtFin_SF_2 Bsmt_Unf_SF Total_Bsmt_SF First_Flr_SF Second_Flr_SF Min. :1.000 Min. : 0.00 Min. : 0.0 Min. : 0.0 Min. : 334.0 Min. : 0.0 1st Qu.:2.000 1st Qu.: 0.00 1st Qu.: 226.0 1st Qu.: 801.2 1st Qu.: 882.2 1st Qu.: 0.0 Median :3.000 Median : 0.00 Median : 458.5 Median : 974.0 Median :1062.5 Median : 0.0 Mean :4.124 Mean : 57.81 Mean : 534.1 Mean :1031.3 Mean :1145.0 Mean : 344.5 3rd Qu.:7.000 3rd Qu.: 0.00 3rd Qu.: 780.0 3rd Qu.:1228.0 3rd Qu.:1344.0 3rd Qu.: 723.8 Max. :7.000 Max. :1526.00 Max. :2336.0 Max. :3206.0 Max. :3820.0 Max. :1872.0 Gr_Liv_Area Bsmt_Full_Bath Bsmt_Half_Bath Full_Bath Half_Bath Bedroom_AbvGr Min. : 334 Min. :0.0000 Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.000 1st Qu.:1111 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:3.000 Median :1445 Median :0.0000 Median :0.00000 Median :1.000 Median :0.0000 Median :3.000 Mean :1494 Mean :0.4181 Mean :0.06693 Mean :1.512 Mean :0.3796 Mean :2.915 3rd Qu.:1762 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.0000 3rd Qu.:3.000 Max. :4316 Max. :2.0000 Max. :2.00000 Max. :3.000 Max. :2.0000 Max. :5.000 Kitchen_AbvGr TotRms_AbvGrd Fireplaces Garage_Cars Garage_Area Wood_Deck_SF Open_Porch_SF Min. :0.000 Min. : 2.000 Min. :0.0000 Min. :0.00 Min. : 0 Min. : 0.00 Min. : 0.0 1st Qu.:1.000 1st Qu.: 5.000 1st Qu.:0.0000 1st Qu.:1.00 1st Qu.: 312 1st Qu.: 0.00 1st Qu.: 0.0 Median :1.000 Median : 6.000 Median :1.0000 Median :2.00 Median : 472 Median : 0.00 Median : 25.0 Mean :1.002 Mean : 6.437 Mean :0.6344 Mean :1.74 Mean : 468 Mean : 99.24 Mean : 46.2 3rd Qu.:1.000 3rd Qu.: 7.000 3rd Qu.:1.0000 3rd Qu.:2.00 3rd Qu.: 576 3rd Qu.: 180.00 3rd Qu.: 72.0 Max. :3.000 Max. :12.000 Max. :4.0000 Max. :5.00 Max. :1488 Max. :1424.00 Max. :570.0 Enclosed_Porch Three_season_porch Screen_Porch Pool_Area Misc_Val Mo_Sold Min. : 0.00 Min. : 0.000 Min. : 0.00 Min. : 0.000 Min. : 0.0 Min. : 1.000 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 0.000 1st Qu.: 0.0 1st Qu.: 4.000 Median : 0.00 Median : 0.000 Median : 0.00 Median : 0.000 Median : 0.0 Median : 6.000 Mean : 25.78 Mean : 2.864 Mean : 17.76 Mean : 2.142 Mean : 54.2 Mean : 6.109 3rd Qu.: 0.00 3rd Qu.: 0.000 3rd Qu.: 0.00 3rd Qu.: 0.000 3rd Qu.: 0.0 3rd Qu.: 7.000 Max. :1012.00 Max. :508.000 Max. :576.00 Max. :800.000 Max. :15500.0 Max. :12.000 Year_Sold Sale_Price Longitude Latitude Min. :2006 Min. : 35000 Min. :-93.69 Min. :41.99 1st Qu.:2007 1st Qu.:130063 1st Qu.:-93.66 1st Qu.:42.02 Median :2008 Median :161875 Median :-93.64 Median :42.03 Mean :2008 Mean :179185 Mean :-93.64 Mean :42.03 3rd Qu.:2009 3rd Qu.:212450 3rd Qu.:-93.62 3rd Qu.:42.05 Max. :2010 Max. :755000 Max. :-93.58 Max. :42.06 > Continious = subset(quantitative_subset, + select = c(Lot_Frontage, Lot_Area, Mas_Vnr_Area, BsmtFin_SF_1, BsmtFin_SF_2, + Bsmt_Unf_SF, Total_Bsmt_SF, First_Flr_SF, Second_Flr_SF, + Gr_Liv_Area, Garage_Area, Wood_Deck_SF, Open_Porch_SF, Enclosed_Porch, + Three_season_porch, Screen_Porch, Pool_Area, Misc_Val, Sale_Price)) > # There is probably an easier way to make R partition out the continuous varia- > # bles, but I am not familiar enough with R yet to figure it out. So I just did > # it manually for this assingment. > > # Create training and testing datasets > set.seed(123) > atrain <- sample(2002,1001) > train <- Continious[atrain,] > test <- Continious[-atrain,] > cor <- cor(na.omit(train[,-1])) > > > #Correlation Plot > library(corrplot) > library(RColorBrewer) > library("PerformanceAnalytics") > corrplot(cor, type="upper", is.corr = FALSE,addCoef.col = 'black', + cl.pos = 'b',method = 'color',col=brewer.pal(n=8, name="RdYlBu"), + tl.cex = 0.5,number.cex = 0.4) > > > #PCA Analysis > pca <- prcomp(train[,-c(19)], center = TRUE, scale = TRUE) > attributes(pca) $names [1] "sdev" "rotation" "center" "scale" "x" $class [1] "prcomp" > pca$sdev^2 #eigen values [1] 3.490177708 1.843529577 1.573810225 1.140692391 1.122624662 1.009961720 1.008267830 0.969453953 0.955529702 [10] 0.900697851 0.853754493 0.797304453 0.684892541 0.583610527 0.561728105 0.335392651 0.164863925 0.003707686 > sum(pca$sdev^2) #should equal the number of variables [1] 18 > var_explained <- (pca$sdev^2)/18#Percent variability explained by each variable > var_explained [1] 0.1938987616 0.1024183098 0.0874339014 0.0633717995 0.0623680368 0.0561089844 0.0560148795 0.0538585530 [9] 0.0530849834 0.0500387695 0.0474308052 0.0442946918 0.0380495856 0.0324228071 0.0312071169 0.0186329251 [17] 0.0091591069 0.0002059826 > > #Scree Plots: > library(ggplot2) > qplot(c(1:18), pca$sdev^2) + + geom_line() + + xlab("Principal Component") + + ylab("Eigen Values") + + ggtitle("Scree Plot") > qplot(c(1:18), var_explained) + + geom_line() + + xlab("Principal Component") + + ylab("Variance Explained") + + ggtitle("Scree Plot") > > #The first 5 PCs have eigenvalues significantly above 1 so we will include them! > > #Paired Panel Plots > library(psych) > pairs.panels(pca$x[,1:5], gap=0,pch=2) > > #Create Biplots > library(devtools) > library(ggbiplot) > ggbiplot(pca,choices = c(1,2),alpha=0) > ggbiplot(pca,choices=c(1,3),alpha=0) > ggbiplot(pca,choices=c(1,4),alpha=0) > ggbiplot(pca,choices = c(1,5),alpha=0) > > #Give the Sales_Price Column back to the DataSets > trg <- predict(pca, train) > trg <- data.frame(train[c(19)],trg) > tst <- predict(pca, test) > tst <- data.frame(test[c(19)],tst) > > #5 PC Model > trainPC5 <- lm(Sale_Price ~ PC1 + PC2 + PC3 + PC4 + PC5, data=trg) > summary(trainPC5) Call: lm(formula = Sale_Price ~ PC1 + PC2 + PC3 + PC4 + PC5, data = trg) Residuals: Min 1Q Median 3Q Max -129641 -17099 976 17421 233412 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 176943.7 1015.3 174.272 < 2e-16 *** PC1 32943.7 543.8 60.586 < 2e-16 *** PC2 4824.4 748.2 6.448 1.76e-10 *** PC3 7562.9 809.7 9.340 < 2e-16 *** PC4 -524.9 951.1 -0.552 0.5811 PC5 1987.3 958.8 2.073 0.0385 * --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 32120 on 995 degrees of freedom Multiple R-squared: 0.7927, Adjusted R-squared: 0.7916 F-statistic: 760.8 on 5 and 995 DF, p-value: < 2.2e-16 > > #Full Model > trainfullPC <- lm(Sale_Price ~ ., data=trg) > summary(trainfullPC) Call: lm(formula = Sale_Price ~ ., data = trg) Residuals: Min 1Q Median 3Q Max -115438 -15970 987 17015 188185 Coefficients: Estimate Std. Error t value Pr(>|t|) (Intercept) 176943.7 928.7 190.536 < 2e-16 *** PC1 32943.7 497.3 66.240 < 2e-16 *** PC2 4824.4 684.3 7.050 3.37e-12 *** PC3 7562.9 740.6 10.211 < 2e-16 *** PC4 -524.9 869.9 -0.603 0.546364 PC5 1987.3 876.9 2.266 0.023656 * PC6 531.2 924.5 0.575 0.565693 PC7 736.2 925.3 0.796 0.426424 PC8 3538.2 943.7 3.749 0.000188 *** PC9 -702.3 950.5 -0.739 0.460172 PC10 -5070.1 979.0 -5.179 2.71e-07 *** PC11 -207.8 1005.6 -0.207 0.836336 PC12 -2244.5 1040.5 -2.157 0.031244 * PC13 -3115.9 1122.7 -2.775 0.005619 ** PC14 1986.7 1216.2 1.633 0.102685 PC15 -8040.7 1239.7 -6.486 1.39e-10 *** PC16 8138.2 1604.3 5.073 4.69e-07 *** PC17 -20659.9 2288.3 -9.029 < 2e-16 *** PC18 -11957.4 15258.9 -0.784 0.433445 --- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 Residual standard error: 29380 on 982 degrees of freedom Multiple R-squared: 0.8288, Adjusted R-squared: 0.8257 F-statistic: 264.1 on 18 and 982 DF, p-value: < 2.2e-16 > > #Predictive Model Testing: > library(tidyverse) > library(MASS) > library(leaps) > library(caret) > predsmain <- predict(trainfullPC, tst) > full.model <- data.frame(R2=R2(predsmain,tst $Sale_Price), + RMSE=RMSE(predsmain,tst $Sale_Price), + MAE=MAE(predsmain,tst $Sale_Price)) > > predsmain <- predict(trainPC5, tst) > five.model <- data.frame(R2=R2(predsmain,tst $Sale_Price), + RMSE=RMSE(predsmain,tst $Sale_Price), + MAE=MAE(predsmain,tst $Sale_Price)) > full.model R2 RMSE MAE 1 0.8046324 33686.5 23995.07 > five.model R2 RMSE MAE 1 0.7722384 36352.27 25749.46 > > #Compare the two Models: > complete <- rbind(full.model,five.model) > row.names(complete) <- c("Model with 18 PCs", "Model with 5 PCs") > complete R2 RMSE MAE Model with 18 PCs 0.8046324 33686.50 23995.07 Model with 5 PCs 0.7722384 36352.27 25749.46 > > #Summary: > # This PCA revealed that 77% of the variance in Sales Price could be explained > # by 5 principal components. As opposed to 80% of the variance in Sales_Price > # being explained by all 18 principal componenets. This gives reason to suggest > # that a 5 component analysis is sufficiently predictive when compared to a more > # fully inclusive model. >
Editor is loading...
Leave a Comment