The goal for today is to calculate a PCA model. First, let’s load our packages:

# Load tidyverse and lavaan packages

library(tidyverse)
library(psych)

Let’s load in some data:

health <- read_csv(file = "health.csv")
## Rows: 750 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (6): sleep, exercise, calories, steps, conditions, BMI
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(health)
## # A tibble: 6 × 6
##   sleep exercise calories steps conditions   BMI
##   <dbl>    <dbl>    <dbl> <dbl>      <dbl> <dbl>
## 1  5.29     19.8    1700. 4999.      1.15   18.9
## 2  5.48     20.1    1699. 5001.      1.23   17.8
## 3  4.74     19.4    1701. 5000.     -0.127  18.6
## 4  4.63     20.5    1700. 5000.      2.71   19.9
## 5  2.95     17.7    1699. 4997.      0.511  17.5
## 6  5.62     19.8    1698. 5000.      0.243  17.4

Now let’s view a sample correlation matrix (i.e., our input matrix):

cormat <- round(cor(health),2)
cormat
##            sleep exercise calories steps conditions  BMI
## sleep       1.00     0.70     0.45  0.27       0.25 0.20
## exercise    0.70     1.00     0.63  0.40       0.33 0.22
## calories    0.45     0.63     1.00  0.20       0.25 0.15
## steps       0.27     0.40     0.20  1.00       0.13 0.06
## conditions  0.25     0.33     0.25  0.13       1.00 0.06
## BMI         0.20     0.22     0.15  0.06       0.06 1.00

View the eigenvectors and eigenvalues:

eigenmatrix <- (eigen(cor(health)))
eigenmatrix
## eigen() decomposition
## $values
## [1] 2.6040541 0.9588947 0.8822176 0.7779127 0.5432352 0.2336856
## 
## $vectors
##            [,1]        [,2]        [,3]       [,4]         [,5]        [,6]
## [1,] -0.4965548  0.03948019 -0.04180044  0.2388947  0.692712758 -0.46174209
## [2,] -0.5620992 -0.03432599 -0.05155256  0.1533499  0.080525024  0.80635481
## [3,] -0.4618974 -0.01554964  0.13778697  0.4313996 -0.688711338 -0.32710071
## [4,] -0.3090910 -0.31038011 -0.69796096 -0.5157340 -0.173385352 -0.15790316
## [5,] -0.2953985 -0.27673744  0.69922792 -0.5859317  0.006796158 -0.06224333
## [6,] -0.2033006  0.90780054 -0.02325116 -0.3521505 -0.096087359 -0.02799378
round(eigenmatrix$values,3)
## [1] 2.604 0.959 0.882 0.778 0.543 0.234
eigenmatrix$vectors
##            [,1]        [,2]        [,3]       [,4]         [,5]        [,6]
## [1,] -0.4965548  0.03948019 -0.04180044  0.2388947  0.692712758 -0.46174209
## [2,] -0.5620992 -0.03432599 -0.05155256  0.1533499  0.080525024  0.80635481
## [3,] -0.4618974 -0.01554964  0.13778697  0.4313996 -0.688711338 -0.32710071
## [4,] -0.3090910 -0.31038011 -0.69796096 -0.5157340 -0.173385352 -0.15790316
## [5,] -0.2953985 -0.27673744  0.69922792 -0.5859317  0.006796158 -0.06224333
## [6,] -0.2033006  0.90780054 -0.02325116 -0.3521505 -0.096087359 -0.02799378
# Check the sum of the eigenvalues
sum(eigenmatrix$values)
## [1] 6
round(eigenmatrix$vectors,2)
##       [,1]  [,2]  [,3]  [,4]  [,5]  [,6]
## [1,] -0.50  0.04 -0.04  0.24  0.69 -0.46
## [2,] -0.56 -0.03 -0.05  0.15  0.08  0.81
## [3,] -0.46 -0.02  0.14  0.43 -0.69 -0.33
## [4,] -0.31 -0.31 -0.70 -0.52 -0.17 -0.16
## [5,] -0.30 -0.28  0.70 -0.59  0.01 -0.06
## [6,] -0.20  0.91 -0.02 -0.35 -0.10 -0.03
# Calculate the proportion of variance explained by each component
(eigenmatrix$values/sum(eigenmatrix$values))*100
## [1] 43.40090 15.98158 14.70363 12.96521  9.05392  3.89476

Let’s check the number of eigenvalues:

eigen_res <- eigen(cor(health))
round(eigen_res$values,3)
## [1] 2.604 0.959 0.882 0.778 0.543 0.234

Determining the number of components to retain

# Scree plot
plot(eigenmatrix$values, # Vector of eigenvalues
     type = 'b', # Display points and lines
     pch = 16, # 
     main = "Scree plot", 
     ylab = "Eigenvalues")

# Minimum average partial
vssmodel <- vss(health)

# Parallel analysis
fa.parallel(health, n.iter = 500)

## Parallel analysis suggests that the number of factors =  1  and the number of components =  1

Running PCA and comparing two solutions:

PC1 <- principal(health, nfactors = 1, rotate = "none")
PC2 <- principal(health, nfactors = 2, rotate = "none")

PC1
## Principal Components Analysis
## Call: principal(r = health, nfactors = 1, rotate = "none")
## Standardized loadings (pattern matrix) based upon correlation matrix
##             PC1   h2   u2 com
## sleep      0.80 0.64 0.36   1
## exercise   0.91 0.82 0.18   1
## calories   0.75 0.56 0.44   1
## steps      0.50 0.25 0.75   1
## conditions 0.48 0.23 0.77   1
## BMI        0.33 0.11 0.89   1
## 
##                 PC1
## SS loadings    2.60
## Proportion Var 0.43
## 
## Mean item complexity =  1
## Test of the hypothesis that 1 component is sufficient.
## 
## The root mean square of the residuals (RMSR) is  0.11 
##  with the empirical chi square  253.34  with prob <  2e-49 
## 
## Fit based upon off diagonal values = 0.9
PC2
## Principal Components Analysis
## Call: principal(r = health, nfactors = 2, rotate = "none")
## Standardized loadings (pattern matrix) based upon correlation matrix
##             PC1   PC2   h2   u2 com
## sleep      0.80  0.04 0.64 0.36 1.0
## exercise   0.91 -0.03 0.82 0.18 1.0
## calories   0.75 -0.02 0.56 0.44 1.0
## steps      0.50 -0.30 0.34 0.66 1.7
## conditions 0.48 -0.27 0.30 0.70 1.6
## BMI        0.33  0.89 0.90 0.10 1.3
## 
##                        PC1  PC2
## SS loadings           2.60 0.96
## Proportion Var        0.43 0.16
## Cumulative Var        0.43 0.59
## Proportion Explained  0.73 0.27
## Cumulative Proportion 0.73 1.00
## 
## Mean item complexity =  1.3
## Test of the hypothesis that 2 components are sufficient.
## 
## The root mean square of the residuals (RMSR) is  0.12 
##  with the empirical chi square  333.17  with prob <  7.5e-71 
## 
## Fit based upon off diagonal values = 0.87

Evaluating our solution:

PC1$loadings
## 
## Loadings:
##            PC1  
## sleep      0.801
## exercise   0.907
## calories   0.745
## steps      0.499
## conditions 0.477
## BMI        0.328
## 
##                  PC1
## SS loadings    2.604
## Proportion Var 0.434
PC2$loadings
## 
## Loadings:
##            PC1    PC2   
## sleep       0.801       
## exercise    0.907       
## calories    0.745       
## steps       0.499 -0.304
## conditions  0.477 -0.271
## BMI         0.328  0.889
## 
##                  PC1   PC2
## SS loadings    2.604 0.959
## Proportion Var 0.434 0.160
## Cumulative Var 0.434 0.594
PC2$values
## [1] 2.6040541 0.9588947 0.8822176 0.7779127 0.5432352 0.2336856
round(PC2$values/sum(PC2$values),3)
## [1] 0.434 0.160 0.147 0.130 0.091 0.039

Saving the scores:

scores <- PC2$scores
head(scores, 10)
##               PC1        PC2
##  [1,]  0.07909052  1.0939224
##  [2,]  0.17914761 -0.7393742
##  [3,] -0.02685503  0.8161932
##  [4,]  0.69868310  1.2197222
##  [5,] -2.41480329  0.5284260
##  [6,] -0.59492854 -0.2704594
##  [7,]  0.14903848  0.5648958
##  [8,] -1.44757185 -0.8904365
##  [9,] -0.67902075 -2.3279539
## [10,]  1.10254370 -1.0786089
head(health)
## # A tibble: 6 × 6
##   sleep exercise calories steps conditions   BMI
##   <dbl>    <dbl>    <dbl> <dbl>      <dbl> <dbl>
## 1  5.29     19.8    1700. 4999.      1.15   18.9
## 2  5.48     20.1    1699. 5001.      1.23   17.8
## 3  4.74     19.4    1701. 5000.     -0.127  18.6
## 4  4.63     20.5    1700. 5000.      2.71   19.9
## 5  2.95     17.7    1699. 4997.      0.511  17.5
## 6  5.62     19.8    1698. 5000.      0.243  17.4