2018年2月20日星期二

Intro_To_R & RStudio

1)      Install packages
-          Pixmap
-          Tidyverse
-          Car
-          rmarkdown
2)      CRAN: r repositories and packages to be downloaded using CURL
3)      Assign value in R, the variable is case-sensitive
> four<-2+2
> four
[1] 4
> four
[1] 4

4)      Import data from a .csv file
5)      Using RStudio
### this is generated code from R Studio
> library(readr)
Warning message:
package ‘readr’ was built under R version 3.3.3 
> weightloss <- read_csv("C:/Users/cw811/Desktop/Intro_to_R/weightloss.csv")
Parsed with column specification:
cols(
  weight = col_double(),
  weightpost = col_double(),
  gender = col_character(),
  group = col_character(),
  previousdiet = col_character()
)
> View(weightloss)
> 

### to view a summary of the dataset
> summary(weightloss)
     weight         weightpost        gender             group           previousdiet      
 Min.   : 65.74   Min.   : 63.07   Length:132         Length:132         Length:132        
 1st Qu.: 80.33   1st Qu.: 78.29   Class :character   Class :character   Class :character  
 Median : 84.28   Median : 82.55   Mode  :character   Mode  :character   Mode  :character  
 Mean   : 86.32   Mean   : 83.58                                                           
 3rd Qu.: 93.03   3rd Qu.: 88.02                                                           
 Max.   :110.46   Max.   :104.10                                                           
                  NA's   :82          

> hist(weightloss$weight)
> mean_weightloss
[1] 86.31773
> 
> boxplot(weightloss$weight)
> hist(weightloss$weightpost)

#### Table command
> table(weightloss$previousdiet)
 
 no yes 
 29  21 

### Vector,  Matrix , Dataframe, List
# c is a combined function, means putting the items into a colum
> a <- c(7.1,-8,3.9,4,9,5) # numeric vector
> b <- c("green","blue","red") # character vector
> c <- c(TRUE,FALSE,TRUE,FALSE,TRUE,FALSE) #logical vector

> #if we use by row=FALSE the matrix is filled by column
> matrix_2 = matrix( 
+   c(6, 2, 5, 4, 9, 7), # the data elements 
+   nrow=2,              # number of rows 
+   ncol=3,              # number of columns 
+   byrow = FALSE)       # fill matrix by rows 
> 
> matrix_2                # print the matrix

> #Data Frames
> Column_1 <-c(2, 5.5, 6, 1.3, 10, 4)
> Column_2 <-c(7, 3, 2.9, 4.5, 9.4, 1)
> 
> Dataframe_1<-data.frame(Column_1, Column_2)
> Dataframe_1

### Subsetting: indexing and values
### Fairly slow on large computing
> 
> # using vector a above we can select the 3rd element
> a[3]
[1] 3.9
> # select all elements except the 3rd element
> a[-3]
[1]  7.1 -8.0  4.0  9.0  5.0
> # select all elements whose values are < 4
> a[a<4]
[1] -8.0  3.9
> # select values equal to 9
> a[a==9]
[1] 9
> # select values greater than 2 & less than or equal to 5
> a[a>2 & a<=5]
[1] 3.9 4.0 5.0
> # select values of a less than or equal to 0 or greater than 5
> a[a<=0 | a>5]
[1]  7.1 -8.0  9.0
> # select elements 1 to 3 using the sequence operator :
> a[1:3]
[1]  7.1 -8.0  3.9
> # select elements 3 and 5
> a[c(3,5)]
[1] 3.9 9.0
> # check the class of a
> class(a)
[1] "numeric"

> #You can access the elements of a data frame using the number of the row and column entries. 
> #The row is first and the column is second, calls are made using square brackets. 
> #For example to call the element in the 3rd row of the 2nd column we use the following command 
> Dataframe_1[3,2]
[1] 2.9
> Dataframe_1
  Column_1 Column_2
1      2.0      7.0
2      5.5      3.0
3      6.0      2.9
4      1.3      4.5
5     10.0      9.4
6      4.0      1.0

### Tidyverse for data manipulation
### Book: << R for Data Science>> http://r4ds.had.co.nz/
#### Tidyverse & tibble

> # Import weightloss as a "tibbles" instead of a "dataframes"
> weightlosst <-read_csv("C:/Users/cw811/Desktop/Intro_to_R/weightloss.csv")

> View(weightlosst)
> weightlosst

> select (weightlosst, weight, gender, group)
# A tibble: 132 x 3
   weight gender group      
    <dbl> <chr>  <chr>      
 1   76.2 female participated
 2   77.9 female participated
 3   78.8 female participated
 4   78.9 female participated
 5   79.0 female participated
 6   79.0 female participated
 7   79.3 female participated
 8   81.0 male   participated
 9   82.2 male   participated
10   83.1 male   participated
# ... with 122 more rows
>
>
>
> filter(weightlosst, group=="participated")
# A tibble: 50 x 5
   weight weightpost gender group        previousdiet
    <dbl>      <dbl> <chr>  <chr>        <chr>      
 1   76.2       76.6 female participated yes        
 2   77.9       63.1 female participated yes        
 3   78.8       66.6 female participated yes        
 4   78.9       69.3 female participated no         
 5   79.0       77.2 female participated yes        
 6   79.0       69.8 female participated no         
 7   79.3       71.8 female participated yes        
 8   81.0       77.9 male   participated no          
 9   82.2       82.2 male   participated no         
10   83.1       82.1 male   participated yes        
# ... with 40 more rows
>
> weightlossint1 <- filter(weightlosst, weight>80)
> weightlossint2 <- select(weightlosst, weight, gender, group)
> View(weightlossint2)
> 
> 

> # 2: nesting
> weightlossint <- select(filter(weightlosst, weight>80), weight, gender, group)
> View(weightlossint)
 
> # 3: Pipes – simplify the intermediate steps and variable savings
> weightloss_sml <- weightlosst %>%
+   filter(weight > 80) %>%
+   select(weight, gender, group)
> View(weightloss_sml)

### mutate tibbles
> # We can use pipes to create new variables using existing variables with mutate
> weightlosst %>%
+   mutate(weightg =weight*1000)
# A tibble: 132 x 6
   weight weightpost gender group        previousdiet weightg
    <dbl>      <dbl> <chr>  <chr>        <chr>          <dbl>
 1   76.2       76.6 female participated yes            76220
 2   77.9       63.1 female participated yes            77910
 3   78.8       66.6 female participated yes            78810
 4   78.9       69.3 female participated no             78890
 5   79.0       77.2 female participated yes            78950
 6   79.0       69.8 female participated no             78960
 7   79.3       71.8 female participated yes            79310
 8   81.0       77.9 male   participated no             80970
 9   82.2       82.2 male   participated no             82180
10   83.1       82.1 male   participated yes            83110
# ... with 122 more rows
### using group by the null values can be found eg.
> # Often we would want to split the data into groups and apply an analysis to the groups, we can use group_by to do this
> weightlosst %>%
+   group_by(gender) %>%
+   summarize(weight_av =mean(weight))
# A tibble: 3 x 2
  gender weight_av
  <chr>      <dbl>
1 female      85.9
2 male        92.9
3 <NA>        84.3

### Markdown and Statistical Tests
### Normality tests
> shapiro.test(weightloss$weight)

        Shapiro-Wilk normality test

data:  weightloss$weight
W = 0.98416, p-value = 0.1291

> shapiro.test(weightloss$weightpost)

        Shapiro-Wilk normality test

data:  weightloss$weightpost
W = 0.96941, p-value = 0.219


### Paired T tests

> t.test(weightloss$weight, weightloss$weightpost, paired=T)

        Paired t-test

data:  weightloss$weight and weightloss$weightpost
t = 11.309, df = 49, p-value = 2.903e-15
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 5.012088 7.178312
sample estimates:
mean of the differences
                 6.0952
>
### One sided T tests in R
> #one sided, one sample t test where the standard/population mean is 110
> t.test(weightloss$weightpost, mu=110, alternative="less")

        One Sample t-test

data:  weightloss$weightpost
t = -21.241, df = 49, p-value < 2.2e-16
alternative hypothesis: true mean is less than 110
95 percent confidence interval:
     -Inf 85.66403
sample estimates:
mean of x
  83.5786
>

### 正态分布的相关检验
> library(car)
 
> leveneTest(weight~gender, data=weightloss)
Levene's Test for Homogeneity of Variance (center = median)
      Df F value Pr(>F)
group  1  0.2166 0.6438
      48               
Warning message:
In leveneTest.default(y = y, group = group, ...) : group coerced to factor.
> #R tells us that gender is not recognised as a factor however has analysed the data as if it is. 
> class(weightloss$gender)
[1] "character"

### if the Normality of the data is better, then there is Bartlett test
> #Bartlett's test
> bartlett.test(weight~Fgender, data=weightloss)
 
        Bartlett test of homogeneity of variances
 
data:  weight by Fgender
Bartlett's K-squared = 0.016271, df = 1, p-value = 0.8985

### independent sample t test unequal/equal variances
> t.test(weightloss$weight ~ weightloss$gender)

        Welch Two Sample t-test

data:  weightloss$weight by weightloss$gender
t = -3.2986, df = 47.109, p-value = 0.001855
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -11.133708  -2.698401
sample estimates:
mean in group female   mean in group male
            85.93913             92.85519

> # independent sample t test equal variances
> t.test(weightloss$weight ~ weightloss$gender, var=TRUE)

        Two Sample t-test

data:  weightloss$weight by weightloss$gender
t = -3.2915, df = 48, p-value = 0.001874
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -11.140752  -2.691357
sample estimates:
mean in group female   mean in group male
            85.93913             92.85519
>

#### ANOVA
#### ~ means “by”
#### using summary to get the values
> table(weightloss$group)
 
    eligible not eligible participated 
          41           41           50 
> boxplot(weightloss$weight~ weightloss$group)
> # check homogeneity of variance assumption using the Levene's test the CAR library
> library(car)
> #Change group to factor
> Fgroup<-factor(weightloss$group)
> class(Fgroup)
[1] "factor"
> 
> leveneTest(weight~Fgroup, data=weightloss)
Levene's Test for Homogeneity of Variance (center = median)
       Df F value Pr(>F)
group   2  0.5807  0.561
      129               
> aovgroup<-aov(weightloss$weight~weightloss$group)
> summary(aovgroup)
                  Df Sum Sq Mean Sq F value   Pr(>F)    
weightloss$group   2   1212   606.2   9.017 0.000216 ***
Residuals        129   8673    67.2                     
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
 
> # Post Hoc test to determine which groups differ
> TukeyHSD(aovgroup)
  Tukey multiple comparisons of means
    95% family-wise confidence level

Fit: aov(formula = weightloss$weight ~ weightloss$group)

$`weightloss$group`
                              diff        lwr       upr     p adj
not eligible-eligible     3.863171 -0.4307754  8.157117 0.0871709
participated-eligible     7.334044  3.2378804 11.430207 0.0001221
participated-not eligible 3.470873 -0.6252903  7.567037 0.1140809
>
> # Non parametric alternative
> #the kruskal-wallis test is the non parametric alternative to ANOVA. Note need to use group as factor.
> Fgroup<-as.factor(weightloss$group)
> kruskal.test(weightloss$weight~Fgroup)

        Kruskal-Wallis rank sum test

data:  weightloss$weight by Fgroup
Kruskal-Wallis chi-squared = 15.772, df = 2, p-value = 0.000376
>
 

# Crosstabulations and the Chi Square test
> #Crosstabulations
> table(weightloss$gender, weightloss$previousdiet)
       
         no yes
  female  9  14
  male   20   7
> chisq.test(weightloss$gender, weightloss$previousdiet)

        Pearson's Chi-squared test with Yates' continuity correction

data:  weightloss$gender and weightloss$previousdiet
X-squared = 4.8738, df = 1, p-value = 0.02727

> #Constructing a crosstabulation from summary data
> priordiet<-structure(list(dieters=c(14,7), total=c(23,27)), .names=c("dieters","total"), row.names=c("female", "male"), class="data.frame")
> prop.test(priordiet$dieters, priordiet$total)

        2-sample test for equality of proportions with continuity correction

data:  priordiet$dieters out of priordiet$total
X-squared = 4.8738, df = 1, p-value = 0.02727
alternative hypothesis: two.sided
95 percent confidence interval:
 0.05013246 0.64874032
sample estimates:
   prop 1    prop 2
0.6086957 0.2592593


> # scatterplot of weight pre and weight post
> plot(weightloss$weight, weightloss$weightpost)
> correlation<-cor(weightloss$weight, weightloss$weightpost)
> correlation
[1] NA

#### solve the NA by “pairwise.complete.obs
>
> #returns NA as there is missing data in the weightloss post column,
> #use Help to look at the documentation for cor to see the missing data options
> correlation<-cor(weightloss$weight, weightloss$weightpost, use='pairwise.complete.obs')
> correlation
[1] 0.9014941


> #The cor function only gives the correlation coefficient,
> #if we wish to know the P value and/or CI associated with this coefficient we use the cor.test function
> correlation2<-cor.test(weightloss$weight, weightloss$weightpost)
> correlation2

        Pearson's product-moment correlation

data:  weightloss$weight and weightloss$weightpost
t = 14.431, df = 48, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.8318921 0.9431726
sample estimates:
      cor
0.9014941


### Regression Model
> regression<-lm(weightloss$weight~weightloss$weightpost)
> regression

Call:
lm(formula = weightloss$weight ~ weightloss$weightpost)

Coefficients:
          (Intercept)  weightloss$weightpost 
              20.1659                 0.8316 

> # additional commands for P values associated with the regression
>
> #note that plot can have several different meanings depending on what is being plotted.
>
> #adding gender
> regressionmv2<-lm(weightloss$weightpost~weightloss$weight+weightloss$previousdiet+weightloss$gender)
> regressionmv2

Call:
lm(formula = weightloss$weightpost ~ weightloss$weight + weightloss$previousdiet +
    weightloss$gender)

Coefficients:
               (Intercept)           weightloss$weight  weightloss$previousdietyes       weightloss$gendermale 
                    0.7147                      0.9053                      0.4475                      2.7671 

> # use summary to obtain the P values for the coefficients
> summary(regressionmv2)

Call:
lm(formula = weightloss$weightpost ~ weightloss$weight + weightloss$previousdiet +
    weightloss$gender)

Residuals:
     Min       1Q   Median       3Q      Max
-11.1188  -2.1084   0.0074   1.6764   7.1092

Coefficients:
                           Estimate Std. Error t value Pr(>|t|)   
(Intercept)                 0.71474    6.29633   0.114   0.9101   
weightloss$weight           0.90530    0.07309  12.386 2.96e-16 ***
weightloss$previousdietyes  0.44753    1.14830   0.390   0.6985   
weightloss$gendermale       2.76708    1.25636   2.202   0.0327 * 
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.73 on 46 degrees of freedom
  (82 observations deleted due to missingness)
Multiple R-squared:  0.8312,   Adjusted R-squared:  0.8202
F-statistic:  75.5 on 3 and 46 DF,  p-value: < 2.2e-16

> confint(regressionmv2)
                                 2.5 %    97.5 %
(Intercept)                -11.9591136 13.388597
weightloss$weight            0.7581759  1.052424
weightloss$previousdietyes  -1.8638849  2.758949
weightloss$gendermale        0.2381667  5.296003
> #regression diagnostic plots
> plot(regressionmv2)
Hit <Return> to see next plot:
Hit <Return> to see next plot:
Hit <Return> to see next plot:
Hit <Return> to see next plot:

>
> #Transforming a variable and saving and dataframe
> logweight<-log(weightloss$weight)
> weightloss2<-cbind(weightloss, logweight)
> write.csv(weightloss2, "weightloss2.csv")
> plot(weightloss2$logweight, weightloss2$weightpost)

#### Plot examples
> # Basic plots
> x<-1:100 # generate a vector sequence of numbers from 1 to 100
> y<-x^2 # generate a vector of values of vector x squared
> plot(x,y) # draw a scatterplot of x and y
> plot(x,y, type='l') # change the default type of the plot to a line
> plot(x,y, type='l',main="Wealth increase with age", xlab='age',ylab='wealth (in 1000s)', col='blue', lwd=4, lty=2)
> # here we have plotted the same plot adding labels, changing the colour (col) to blue,
> # the line width (lwd) to 4 (the default is 1)
> # and the line type (lty) to 2 (1=solid, 2=dashed, 3=dotted, 4=dotdash, 5=longdash, 6=two dash.
> plot(x,y, type='l') # change the default type of the plot to a line

#### ggplot2
> library("ggplot2", lib.loc="~/R/win-library/3.3")
Stackoverflow is a great place to get help: http://stackoverflow.com/tags/ggplot2.
Warning message:
package ‘ggplot2’ was built under R version 3.3.3
>
>
> data()
> head(diamonds) the diamonds datasets
# A tibble: 6 x 10
  carat cut       color clarity depth table price     x     y     z
  <dbl> <ord>     <ord> <ord>   <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1 0.230 Ideal     E     SI2      61.5  55.0   326  3.95  3.98  2.43
2 0.210 Premium   E     SI1      59.8  61.0   326  3.89  3.84  2.31
3 0.230 Good      E     VS1      56.9  65.0   327  4.05  4.07  2.31
4 0.290 Premium   I     VS2      62.4  58.0   334  4.20  4.23  2.63
5 0.310 Good      J     SI2      63.3  58.0   335  4.34  4.35  2.75
6 0.240 Very Good J     VVS2     62.8  57.0   336  3.94  3.96  2.48
> #make a plot using base
> plot(x=diamonds$carat, y=diamonds$price, type="p")
> title(main="base scatterplot")
>
> #make a "quick" plot using ggplot2
> qplot(x=carat, y=price, data=diamonds, geom="point") +
+ ggtitle("ggplot scatterplot")

#### to show colourful diagram using ggplot2
> # Advantages of ggplot
> # color by a continuous variable
> # First we construct our basic plot which we will call diamondplot, then we will add the colour options
> diamondplot <- ggplot(data = diamonds, aes(x = carat, y = price))
> diamondplot + geom_point(aes(colour = depth))
> # color by a factor variable
> diamondplot + geom_point(aes(colour = color))