Untitled

mail@pastecode.io avatar
unknown
r
7 months ago
18 kB
1
Indexable
Never
> 
> # NOTE(review): setwd() ties this session to one machine's folder layout; it is
> # kept here so the relative file path below still resolves.
> setwd("~/STAT 277")
> 
> # Import the data from "amesOneFam.csv"
> ames_data <- read.csv("amesOneFam.csv")
> 
> # Create a subset of only quantitative (numeric) columns. vapply() guarantees a
> # logical mask with one entry per column, whereas sapply() can silently change
> # its return type depending on the input.
> is_numeric_col <- vapply(ames_data, is.numeric, logical(1))
> quantitative_subset <- ames_data[is_numeric_col]
> attributes(quantitative_subset)
$names
 [1] "X"                  "Lot_Frontage"       "Lot_Area"           "Year_Built"         "Year_Remod_Add"    
 [6] "Mas_Vnr_Area"       "BsmtFin_SF_1"       "BsmtFin_SF_2"       "Bsmt_Unf_SF"        "Total_Bsmt_SF"     
[11] "First_Flr_SF"       "Second_Flr_SF"      "Gr_Liv_Area"        "Bsmt_Full_Bath"     "Bsmt_Half_Bath"    
[16] "Full_Bath"          "Half_Bath"          "Bedroom_AbvGr"      "Kitchen_AbvGr"      "TotRms_AbvGrd"     
[21] "Fireplaces"         "Garage_Cars"        "Garage_Area"        "Wood_Deck_SF"       "Open_Porch_SF"     
[26] "Enclosed_Porch"     "Three_season_porch" "Screen_Porch"       "Pool_Area"          "Misc_Val"          
[31] "Mo_Sold"            "Year_Sold"          "Sale_Price"         "Longitude"          "Latitude"          

$row.names
   [1]    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16   17   18   19   20   21
  [22]   22   23   24   25   26   27   28   29   30   31   32   33   34   35   36   37   38   39   40   41   42
  [43]   43   44   45   46   47   48   49   50   51   52   53   54   55   56   57   58   59   60   61   62   63
  [64]   64   65   66   67   68   69   70   71   72   73   74   75   76   77   78   79   80   81   82   83   84
  [85]   85   86   87   88   89   90   91   92   93   94   95   96   97   98   99  100  101  102  103  104  105
 [106]  106  107  108  109  110  111  112  113  114  115  116  117  118  119  120  121  122  123  124  125  126
 [127]  127  128  129  130  131  132  133  134  135  136  137  138  139  140  141  142  143  144  145  146  147
 [148]  148  149  150  151  152  153  154  155  156  157  158  159  160  161  162  163  164  165  166  167  168
 [169]  169  170  171  172  173  174  175  176  177  178  179  180  181  182  183  184  185  186  187  188  189
 [190]  190  191  192  193  194  195  196  197  198  199  200  201  202  203  204  205  206  207  208  209  210
 [211]  211  212  213  214  215  216  217  218  219  220  221  222  223  224  225  226  227  228  229  230  231
 [232]  232  233  234  235  236  237  238  239  240  241  242  243  244  245  246  247  248  249  250  251  252
 [253]  253  254  255  256  257  258  259  260  261  262  263  264  265  266  267  268  269  270  271  272  273
 [274]  274  275  276  277  278  279  280  281  282  283  284  285  286  287  288  289  290  291  292  293  294
 [295]  295  296  297  298  299  300  301  302  303  304  305  306  307  308  309  310  311  312  313  314  315
 [316]  316  317  318  319  320  321  322  323  324  325  326  327  328  329  330  331  332  333  334  335  336
 [337]  337  338  339  340  341  342  343  344  345  346  347  348  349  350  351  352  353  354  355  356  357
 [358]  358  359  360  361  362  363  364  365  366  367  368  369  370  371  372  373  374  375  376  377  378
 [379]  379  380  381  382  383  384  385  386  387  388  389  390  391  392  393  394  395  396  397  398  399
 [400]  400  401  402  403  404  405  406  407  408  409  410  411  412  413  414  415  416  417  418  419  420
 [421]  421  422  423  424  425  426  427  428  429  430  431  432  433  434  435  436  437  438  439  440  441
 [442]  442  443  444  445  446  447  448  449  450  451  452  453  454  455  456  457  458  459  460  461  462
 [463]  463  464  465  466  467  468  469  470  471  472  473  474  475  476  477  478  479  480  481  482  483
 [484]  484  485  486  487  488  489  490  491  492  493  494  495  496  497  498  499  500  501  502  503  504
 [505]  505  506  507  508  509  510  511  512  513  514  515  516  517  518  519  520  521  522  523  524  525
 [526]  526  527  528  529  530  531  532  533  534  535  536  537  538  539  540  541  542  543  544  545  546
 [547]  547  548  549  550  551  552  553  554  555  556  557  558  559  560  561  562  563  564  565  566  567
 [568]  568  569  570  571  572  573  574  575  576  577  578  579  580  581  582  583  584  585  586  587  588
 [589]  589  590  591  592  593  594  595  596  597  598  599  600  601  602  603  604  605  606  607  608  609
 [610]  610  611  612  613  614  615  616  617  618  619  620  621  622  623  624  625  626  627  628  629  630
 [631]  631  632  633  634  635  636  637  638  639  640  641  642  643  644  645  646  647  648  649  650  651
 [652]  652  653  654  655  656  657  658  659  660  661  662  663  664  665  666  667  668  669  670  671  672
 [673]  673  674  675  676  677  678  679  680  681  682  683  684  685  686  687  688  689  690  691  692  693
 [694]  694  695  696  697  698  699  700  701  702  703  704  705  706  707  708  709  710  711  712  713  714
 [715]  715  716  717  718  719  720  721  722  723  724  725  726  727  728  729  730  731  732  733  734  735
 [736]  736  737  738  739  740  741  742  743  744  745  746  747  748  749  750  751  752  753  754  755  756
 [757]  757  758  759  760  761  762  763  764  765  766  767  768  769  770  771  772  773  774  775  776  777
 [778]  778  779  780  781  782  783  784  785  786  787  788  789  790  791  792  793  794  795  796  797  798
 [799]  799  800  801  802  803  804  805  806  807  808  809  810  811  812  813  814  815  816  817  818  819
 [820]  820  821  822  823  824  825  826  827  828  829  830  831  832  833  834  835  836  837  838  839  840
 [841]  841  842  843  844  845  846  847  848  849  850  851  852  853  854  855  856  857  858  859  860  861
 [862]  862  863  864  865  866  867  868  869  870  871  872  873  874  875  876  877  878  879  880  881  882
 [883]  883  884  885  886  887  888  889  890  891  892  893  894  895  896  897  898  899  900  901  902  903
 [904]  904  905  906  907  908  909  910  911  912  913  914  915  916  917  918  919  920  921  922  923  924
 [925]  925  926  927  928  929  930  931  932  933  934  935  936  937  938  939  940  941  942  943  944  945
 [946]  946  947  948  949  950  951  952  953  954  955  956  957  958  959  960  961  962  963  964  965  966
 [967]  967  968  969  970  971  972  973  974  975  976  977  978  979  980  981  982  983  984  985  986  987
 [988]  988  989  990  991  992  993  994  995  996  997  998  999 1000
 [ reached getOption("max.print") -- omitted 1002 entries ]

$class
[1] "data.frame"

> summary(quantitative_subset) # min, quartiles, median, mean and max of every numeric column
       X           Lot_Frontage       Lot_Area        Year_Built   Year_Remod_Add  Mas_Vnr_Area    
 Min.   :   1.0   Min.   :  0.00   Min.   :  2500   Min.   :1872   Min.   :1950   Min.   :   0.00  
 1st Qu.: 709.5   1st Qu.: 50.00   1st Qu.:  8127   1st Qu.:1950   1st Qu.:1963   1st Qu.:   0.00  
 Median :1394.5   Median : 65.00   Median :  9750   Median :1968   Median :1992   Median :   0.00  
 Mean   :1423.6   Mean   : 58.85   Mean   : 10832   Mean   :1968   Mean   :1983   Mean   :  93.56  
 3rd Qu.:2120.8   3rd Qu.: 80.00   3rd Qu.: 11795   3rd Qu.:1996   3rd Qu.:2002   3rd Qu.: 143.00  
 Max.   :2930.0   Max.   :313.00   Max.   :215245   Max.   :2010   Max.   :2010   Max.   :1600.00  
  BsmtFin_SF_1    BsmtFin_SF_2      Bsmt_Unf_SF     Total_Bsmt_SF     First_Flr_SF    Second_Flr_SF   
 Min.   :1.000   Min.   :   0.00   Min.   :   0.0   Min.   :   0.0   Min.   : 334.0   Min.   :   0.0  
 1st Qu.:2.000   1st Qu.:   0.00   1st Qu.: 226.0   1st Qu.: 801.2   1st Qu.: 882.2   1st Qu.:   0.0  
 Median :3.000   Median :   0.00   Median : 458.5   Median : 974.0   Median :1062.5   Median :   0.0  
 Mean   :4.124   Mean   :  57.81   Mean   : 534.1   Mean   :1031.3   Mean   :1145.0   Mean   : 344.5  
 3rd Qu.:7.000   3rd Qu.:   0.00   3rd Qu.: 780.0   3rd Qu.:1228.0   3rd Qu.:1344.0   3rd Qu.: 723.8  
 Max.   :7.000   Max.   :1526.00   Max.   :2336.0   Max.   :3206.0   Max.   :3820.0   Max.   :1872.0  
  Gr_Liv_Area   Bsmt_Full_Bath   Bsmt_Half_Bath      Full_Bath       Half_Bath      Bedroom_AbvGr  
 Min.   : 334   Min.   :0.0000   Min.   :0.00000   Min.   :0.000   Min.   :0.0000   Min.   :0.000  
 1st Qu.:1111   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:3.000  
 Median :1445   Median :0.0000   Median :0.00000   Median :1.000   Median :0.0000   Median :3.000  
 Mean   :1494   Mean   :0.4181   Mean   :0.06693   Mean   :1.512   Mean   :0.3796   Mean   :2.915  
 3rd Qu.:1762   3rd Qu.:1.0000   3rd Qu.:0.00000   3rd Qu.:2.000   3rd Qu.:1.0000   3rd Qu.:3.000  
 Max.   :4316   Max.   :2.0000   Max.   :2.00000   Max.   :3.000   Max.   :2.0000   Max.   :5.000  
 Kitchen_AbvGr   TotRms_AbvGrd      Fireplaces      Garage_Cars    Garage_Area    Wood_Deck_SF     Open_Porch_SF  
 Min.   :0.000   Min.   : 2.000   Min.   :0.0000   Min.   :0.00   Min.   :   0   Min.   :   0.00   Min.   :  0.0  
 1st Qu.:1.000   1st Qu.: 5.000   1st Qu.:0.0000   1st Qu.:1.00   1st Qu.: 312   1st Qu.:   0.00   1st Qu.:  0.0  
 Median :1.000   Median : 6.000   Median :1.0000   Median :2.00   Median : 472   Median :   0.00   Median : 25.0  
 Mean   :1.002   Mean   : 6.437   Mean   :0.6344   Mean   :1.74   Mean   : 468   Mean   :  99.24   Mean   : 46.2  
 3rd Qu.:1.000   3rd Qu.: 7.000   3rd Qu.:1.0000   3rd Qu.:2.00   3rd Qu.: 576   3rd Qu.: 180.00   3rd Qu.: 72.0  
 Max.   :3.000   Max.   :12.000   Max.   :4.0000   Max.   :5.00   Max.   :1488   Max.   :1424.00   Max.   :570.0  
 Enclosed_Porch    Three_season_porch  Screen_Porch      Pool_Area          Misc_Val          Mo_Sold      
 Min.   :   0.00   Min.   :  0.000    Min.   :  0.00   Min.   :  0.000   Min.   :    0.0   Min.   : 1.000  
 1st Qu.:   0.00   1st Qu.:  0.000    1st Qu.:  0.00   1st Qu.:  0.000   1st Qu.:    0.0   1st Qu.: 4.000  
 Median :   0.00   Median :  0.000    Median :  0.00   Median :  0.000   Median :    0.0   Median : 6.000  
 Mean   :  25.78   Mean   :  2.864    Mean   : 17.76   Mean   :  2.142   Mean   :   54.2   Mean   : 6.109  
 3rd Qu.:   0.00   3rd Qu.:  0.000    3rd Qu.:  0.00   3rd Qu.:  0.000   3rd Qu.:    0.0   3rd Qu.: 7.000  
 Max.   :1012.00   Max.   :508.000    Max.   :576.00   Max.   :800.000   Max.   :15500.0   Max.   :12.000  
   Year_Sold      Sale_Price       Longitude         Latitude    
 Min.   :2006   Min.   : 35000   Min.   :-93.69   Min.   :41.99  
 1st Qu.:2007   1st Qu.:130063   1st Qu.:-93.66   1st Qu.:42.02  
 Median :2008   Median :161875   Median :-93.64   Median :42.03  
 Mean   :2008   Mean   :179185   Mean   :-93.64   Mean   :42.03  
 3rd Qu.:2009   3rd Qu.:212450   3rd Qu.:-93.62   3rd Qu.:42.05  
 Max.   :2010   Max.   :755000   Max.   :-93.58   Max.   :42.06  
> # Hand-picked continuous columns plus the response, Sale_Price. Columns are
> # selected by name with `[` instead of subset()'s non-standard evaluation, and
> # `<-` is used for assignment.
> # NOTE(review): the summary above shows BsmtFin_SF_1 ranging only from 1 to 7,
> # so it may be a coded value rather than a square-footage measure -- confirm.
> continuous_cols <- c(
+   "Lot_Frontage", "Lot_Area", "Mas_Vnr_Area", "BsmtFin_SF_1", "BsmtFin_SF_2",
+   "Bsmt_Unf_SF", "Total_Bsmt_SF", "First_Flr_SF", "Second_Flr_SF",
+   "Gr_Liv_Area", "Garage_Area", "Wood_Deck_SF", "Open_Porch_SF",
+   "Enclosed_Porch", "Three_season_porch", "Screen_Porch", "Pool_Area",
+   "Misc_Val", "Sale_Price"
+ )
> Continious <- quantitative_subset[continuous_cols]
> # The continuous variables were partitioned out manually for this assignment;
> # there is probably a more automatic way to do this in R.
> 
> # Create training and testing datasets: a random half of the rows is used for
> # training and the remaining rows for testing. The sizes are derived from the
> # data instead of hard-coding 2002 and 1001, so the split stays valid if the
> # number of rows changes.
> set.seed(123)
> n_obs <- nrow(Continious)
> atrain <- sample(n_obs, n_obs %/% 2)
> train <- Continious[atrain,]
> test <- Continious[-atrain,]
> # Correlation matrix of the training data, computed on complete rows only. All
> # columns are kept: the previous `train[,-1]` silently dropped Lot_Frontage.
> # NOTE(review): the variable name `cor` shadows stats::cor; it is kept because
> # the corrplot() call further down refers to it.
> cor <- cor(na.omit(train))
> 
> 
> #Correlation Plot
> library(corrplot)
> library(RColorBrewer)
> library("PerformanceAnalytics")
> # Upper-triangle colour map of the correlation matrix with the coefficient
> # printed in each cell. `is.corr` is left at its default (TRUE) because the
> # input really is a correlation matrix, which keeps the colour scale anchored
> # on [-1, 1] instead of on the observed range.
> corrplot(cor, type = "upper", method = "color",
+          col = brewer.pal(n = 8, name = "RdYlBu"),
+          addCoef.col = "black", number.cex = 0.4,
+          cl.pos = "b", tl.cex = 0.5)
> 
> 
> #PCA Analysis
> # PCA on the 18 predictors (every column except the response Sale_Price),
> # centred and scaled to unit variance. The response is excluded by name rather
> # than by position 19, and `scale.` is spelled out instead of relying on
> # partial argument matching of `scale`.
> pca <- prcomp(train[, setdiff(names(train), "Sale_Price")],
+               center = TRUE, scale. = TRUE)
> attributes(pca) # list the component names and class of the fitted prcomp object
$names
[1] "sdev"     "rotation" "center"   "scale"    "x"       

$class
[1] "prcomp"

> pca$sdev^2 # eigenvalues: squared standard deviations, i.e. the variance of each principal component
 [1] 3.490177708 1.843529577 1.573810225 1.140692391 1.122624662 1.009961720 1.008267830 0.969453953 0.955529702
[10] 0.900697851 0.853754493 0.797304453 0.684892541 0.583610527 0.561728105 0.335392651 0.164863925 0.003707686
> sum(pca$sdev^2) # should equal the number of variables (18) because the PCA was run on scaled variables
[1] 18
> # Proportion of the total variance explained by each principal component.
> # Dividing by the eigenvalue total avoids hard-coding the number of variables.
> var_explained <- pca$sdev^2 / sum(pca$sdev^2)
> var_explained
 [1] 0.1938987616 0.1024183098 0.0874339014 0.0633717995 0.0623680368 0.0561089844 0.0560148795 0.0538585530
 [9] 0.0530849834 0.0500387695 0.0474308052 0.0442946918 0.0380495856 0.0324228071 0.0312071169 0.0186329251
[17] 0.0091591069 0.0002059826
> 
> #Scree Plots:
> library(ggplot2)
> # Scree plot of the eigenvalues. ggplot() replaces the deprecated qplot(), and
> # the x axis is derived from the number of components instead of a fixed 1:18.
> scree_eigen <- data.frame(pc = seq_along(pca$sdev), eigenvalue = pca$sdev^2)
> ggplot(scree_eigen, aes(x = pc, y = eigenvalue)) +
+   geom_point() +
+   geom_line() +
+   xlab("Principal Component") +
+   ylab("Eigen Values") +
+   ggtitle("Scree Plot")
> # Scree plot of the proportion of variance explained by each component.
> # ggplot() replaces the deprecated qplot(), and the x axis is derived from the
> # length of var_explained instead of a fixed 1:18.
> scree_var <- data.frame(pc = seq_along(var_explained), share = var_explained)
> ggplot(scree_var, aes(x = pc, y = share)) +
+   geom_point() +
+   geom_line() +
+   xlab("Principal Component") +
+   ylab("Variance Explained") +
+   ggtitle("Scree Plot")
> 
> #The first 5 PCs have eigenvalues clearly above 1 (PC6 and PC7 exceed 1 only marginally, at about 1.01), so we will include the first 5!
>  
> #Paired Panel Plots
> library(psych)
> # Scatterplot matrix of the scores on the first five principal components
> first_five_scores <- pca$x[, paste0("PC", 1:5)]
> pairs.panels(first_five_scores, gap = 0, pch = 2)
> 
> #Create Biplots
> library(devtools)
> library(ggbiplot)
> # Biplots of PC1 against each of PC2 through PC5; alpha = 0 makes the
> # observation points fully transparent. print() is needed because plots are
> # not auto-printed inside a loop.
> for (other_pc in 2:5) {
+   print(ggbiplot(pca, choices = c(1, other_pc), alpha = 0))
+ }
> 
> # Project both splits onto the principal components and put the Sale_Price
> # column back as the first column of each data set
> trg <- data.frame(train["Sale_Price"], predict(pca, train))
> tst <- data.frame(test["Sale_Price"], predict(pca, test))
> 
> # 5-PC model: linear regression of Sale_Price on the scores of the first five principal components
> trainPC5 <- lm(Sale_Price ~ PC1 + PC2 + PC3 + PC4 + PC5, data=trg)
> summary(trainPC5)

Call:
lm(formula = Sale_Price ~ PC1 + PC2 + PC3 + PC4 + PC5, data = trg)

Residuals:
    Min      1Q  Median      3Q     Max 
-129641  -17099     976   17421  233412 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 176943.7     1015.3 174.272  < 2e-16 ***
PC1          32943.7      543.8  60.586  < 2e-16 ***
PC2           4824.4      748.2   6.448 1.76e-10 ***
PC3           7562.9      809.7   9.340  < 2e-16 ***
PC4           -524.9      951.1  -0.552   0.5811    
PC5           1987.3      958.8   2.073   0.0385 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 32120 on 995 degrees of freedom
Multiple R-squared:  0.7927,	Adjusted R-squared:  0.7916 
F-statistic: 760.8 on 5 and 995 DF,  p-value: < 2.2e-16

> 
> # Full model: linear regression of Sale_Price on every other column of trg, i.e. all 18 principal components
> trainfullPC <- lm(Sale_Price ~ ., data=trg)
> summary(trainfullPC)

Call:
lm(formula = Sale_Price ~ ., data = trg)

Residuals:
    Min      1Q  Median      3Q     Max 
-115438  -15970     987   17015  188185 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 176943.7      928.7 190.536  < 2e-16 ***
PC1          32943.7      497.3  66.240  < 2e-16 ***
PC2           4824.4      684.3   7.050 3.37e-12 ***
PC3           7562.9      740.6  10.211  < 2e-16 ***
PC4           -524.9      869.9  -0.603 0.546364    
PC5           1987.3      876.9   2.266 0.023656 *  
PC6            531.2      924.5   0.575 0.565693    
PC7            736.2      925.3   0.796 0.426424    
PC8           3538.2      943.7   3.749 0.000188 ***
PC9           -702.3      950.5  -0.739 0.460172    
PC10         -5070.1      979.0  -5.179 2.71e-07 ***
PC11          -207.8     1005.6  -0.207 0.836336    
PC12         -2244.5     1040.5  -2.157 0.031244 *  
PC13         -3115.9     1122.7  -2.775 0.005619 ** 
PC14          1986.7     1216.2   1.633 0.102685    
PC15         -8040.7     1239.7  -6.486 1.39e-10 ***
PC16          8138.2     1604.3   5.073 4.69e-07 ***
PC17        -20659.9     2288.3  -9.029  < 2e-16 ***
PC18        -11957.4    15258.9  -0.784 0.433445    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 29380 on 982 degrees of freedom
Multiple R-squared:  0.8288,	Adjusted R-squared:  0.8257 
F-statistic: 264.1 on 18 and 982 DF,  p-value: < 2.2e-16

> 
> #Predictive Model Testing:
> library(tidyverse)
> library(MASS)
> library(leaps)
> library(caret)
> # Test-set fit statistics (R2, RMSE and MAE) for one set of predictions,
> # returned as a one-row data frame
> holdout_metrics <- function(preds, observed) {
+   data.frame(R2 = R2(preds, observed),
+              RMSE = RMSE(preds, observed),
+              MAE = MAE(preds, observed))
+ }
> predsmain <- predict(trainfullPC, tst)
> full.model <- holdout_metrics(predsmain, tst$Sale_Price)
> 
> predsmain <- predict(trainPC5, tst)
> five.model <- holdout_metrics(predsmain, tst$Sale_Price)
> full.model
         R2    RMSE      MAE
1 0.8046324 33686.5 23995.07
> five.model
         R2     RMSE      MAE
1 0.7722384 36352.27 25749.46
> 
> # Stack the two one-row metric tables and label each row with its model
> model_labels <- c("Model with 18 PCs", "Model with 5 PCs")
> complete <- do.call(rbind, list(full.model, five.model))
> rownames(complete) <- model_labels
> complete
                         R2     RMSE      MAE
Model with 18 PCs 0.8046324 33686.50 23995.07
Model with 5 PCs  0.7722384 36352.27 25749.46
> 
> #Summary:
> # On the held-out test set, the regression on the first 5 principal components
> # explained about 77% of the variance in Sale_Price, compared with about 80%
> # for the regression on all 18 principal components. This suggests that the
> # 5-component model is sufficiently predictive when compared to the more fully
> # inclusive model.
> 
Leave a Comment