1)     
Install packages
-         
Pixmap
-         
Tidyverse
-         
Car
-         
rmarkdown
2)     
CRAN: r repositories
and packages to be downloaded using CURL
3)     
Assign value in R,
the variable is case-sensitive
> four<-2+2
> four
[1] 4
> four
[1] 4
4)     
Import data from a
.csv file
5)     
Using RStudio
### this is generated code from R Studio
> library(readr)
Warning message:
package ‘readr’ was built under R version 3.3.3 
> weightloss <- read_csv("C:/Users/cw811/Desktop/Intro_to_R/weightloss.csv")
Parsed with column specification:
cols(
  weight = col_double(),
  weightpost = col_double(),
  gender = col_character(),
  group = col_character(),
  previousdiet = col_character()
)
> View(weightloss)
>
### to view a summary of the dataset
> summary(weightloss)
     weight         weightpost        gender             group           previousdiet      
 Min.   : 65.74   Min.   : 63.07   Length:132         Length:132         Length:132        
 1st Qu.: 80.33   1st Qu.: 78.29   Class :character   Class :character   Class :character  
Median : 84.28 Median : 82.55 Mode :character Mode :character Mode :character
Mean : 86.32 Mean : 83.58
 3rd Qu.: 93.03   3rd Qu.: 88.02                                                           
 Max.   :110.46   Max.   :104.10                                                           
                  NA's   :82          
> hist(weightloss$weight)
> mean_weightloss
[1] 86.31773
>
> boxplot(weightloss$weight)
> hist(weightloss$weightpost)
#### Table command
> table(weightloss$previousdiet)
 
 no yes 
 29  21 
### Vector,  Matrix , Dataframe,
List
# c is a combined function, means putting the items into a
colum
> a <- c(7.1,-8,3.9,4,9,5) # numeric vector
> b <- c("green","blue","red") # character vector
> c <- c(TRUE,FALSE,TRUE,FALSE,TRUE,FALSE) #logical vector
> #if we use by row=FALSE the matrix is filled by column
> matrix_2 = matrix(
+ c(6, 2, 5, 4, 9, 7), # the data elements
+ nrow=2, # number of rows
+ ncol=3, # number of columns
+ byrow = FALSE) # fill matrix by rows
>
> matrix_2 # print the matrix
> #Data Frames
> Column_1 <-c(2, 5.5, 6, 1.3, 10, 4)
> Column_2 <-c(7, 3, 2.9, 4.5, 9.4, 1)
>
> Dataframe_1<-data.frame(Column_1, Column_2)
> Dataframe_1
### Subsetting: indexing and values
### Fairly slow on large computing
>
> # using vector a above we can select the 3rd element
> a[3]
[1] 3.9
> # select all elements except the 3rd element
> a[-3]
[1]  7.1 -8.0  4.0  9.0  5.0
> # select all elements whose values are < 4
> a[a<4]
[1] -8.0  3.9
> # select values equal to 9
> a[a==9]
[1] 9
> # select values greater than 2 & less than or equal to 5
> a[a>2 & a<=5]
[1] 3.9 4.0 5.0
> # select values of a less than or equal to 0 or greater than 5
> a[a<=0 | a>5]
[1]  7.1 -8.0  9.0
> # select elements 1 to 3 using the sequence operator :
> a[1:3]
[1]  7.1 -8.0  3.9
> # select elements 3 and 5
> a[c(3,5)]
[1] 3.9 9.0
> # check the class of a
> class(a)
[1] "numeric"
> #You can access the elements of a data frame using the number of the row and column entries.
> #The row is first and the column is second, calls are made using square brackets.
> #For example to call the element in the 3rd row of the 2nd column we use the following command
> Dataframe_1[3,2]
[1] 2.9
> Dataframe_1
  Column_1 Column_2
1      2.0      7.0
2      5.5      3.0
3      6.0      2.9
4      1.3      4.5
5     10.0      9.4
6      4.0      1.0
### Tidyverse for data manipulation
### Book: << R for Data Science>> http://r4ds.had.co.nz/
#### Tidyverse & tibble
> # Import weightloss as a "tibbles" instead of a "dataframes"
> weightlosst <-read_csv("C:/Users/cw811/Desktop/Intro_to_R/weightloss.csv")
> View(weightlosst)
> weightlosst
| 
   
> select (weightlosst, weight, gender, group) 
# A tibble: 132 x 3 
   weight gender
  group        
    <dbl>
  <chr>  <chr>        
 1   76.2 female participated 
 2   77.9 female participated 
 3   78.8 female participated 
 4   78.9 female participated 
 5   79.0 female participated 
 6   79.0 female participated 
 7   79.3 female participated 
 8   81.0 male  
  participated 
 9   82.2 male  
  participated 
10   83.1 male   participated 
# ... with 122 more rows 
>  
>  
>  
> filter(weightlosst, group=="participated") 
# A tibble: 50 x 5 
   weight weightpost gender
  group        previousdiet 
    <dbl>      <dbl> <chr>  <chr>        <chr>        
 1   76.2      
  76.6 female participated yes        
   
 2   77.9      
  63.1 female participated yes        
   
 3   78.8      
  66.6 female participated yes        
   
 4   78.9      
  69.3 female participated no         
   
 5   79.0      
  77.2 female participated yes        
   
 6   79.0      
  69.8 female participated no         
   
 7   79.3      
  71.8 female participated yes        
   
 8   81.0      
  77.9 male   participated no           
 9   82.2      
  82.2 male   participated no           
10   83.1       82.1 male   participated yes          
# ... with 40 more rows 
 | 
 |
  
  | 
 
> weightlossint1 <- filter(weightlosst, weight>80)
> weightlossint2 <- select(weightlosst, weight, gender, group)
> View(weightlossint2)
>
>
> # 2: nesting
> weightlossint <- select(filter(weightlosst, weight>80), weight, gender, group)
> View(weightlossint)
 
> # 3: Pipes – simplify the intermediate steps and variable savings
> weightloss_sml <- weightlosst %>%
+ filter(weight > 80) %>%
+ select(weight, gender, group)
> View(weightloss_sml)
### mutate tibbles
| 
   
> # We can use pipes to create new variables using existing
  variables with mutate 
> weightlosst %>% 
+   mutate(weightg
  =weight*1000) 
# A tibble: 132 x 6 
   weight weightpost
  gender group        previousdiet
  weightg 
    <dbl>      <dbl> <chr>  <chr>        <chr>          <dbl> 
 1   76.2      
  76.6 female participated yes           
  76220 
 2   77.9      
  63.1 female participated yes           
  77910 
 3   78.8      
  66.6 female participated yes           
  78810 
 4   78.9      
  69.3 female participated no            
  78890 
 5   79.0      
  77.2 female participated yes           
  78950 
 6   79.0      
  69.8 female participated no            
  78960 
 7   79.3  
      71.8 female participated
  yes            79310 
 8   81.0      
  77.9 male   participated no             80970 
 9   82.2      
  82.2 male   participated no             82180 
10   83.1       82.1 male   participated yes            83110 
# ... with 122 more rows 
 | 
 |
| 
   | 
 
### using group by the null values can be found eg.
> # Often we would want to split the data into groups and apply an analysis to the groups, we can use group_by to do this
> weightlosst %>%
+ group_by(gender) %>%
+ summarize(weight_av =mean(weight))
# A tibble: 3 x 2
  gender weight_av
  <chr>      <dbl>
1 female      85.9
2 male        92.9
3 <NA>        84.3
### Markdown and Statistical Tests
### Normality tests
| 
   
> shapiro.test(weightloss$weight) 
        Shapiro-Wilk
  normality test 
data:  weightloss$weight 
W = 0.98416, p-value = 0.1291 
> shapiro.test(weightloss$weightpost) 
        Shapiro-Wilk
  normality test 
data: 
  weightloss$weightpost 
W = 0.96941, p-value = 0.219 
### Paired T tests 
> t.test(weightloss$weight, weightloss$weightpost, paired=T) 
        Paired t-test 
data:  weightloss$weight
  and weightloss$weightpost 
t = 11.309, df = 49, p-value = 2.903e-15 
alternative hypothesis: true difference in means is not equal to
  0 
95 percent confidence interval: 
 5.012088 7.178312 
sample estimates: 
mean of the differences  
                 6.0952  
 | 
 |
  
  | 
 
### One sided T tests in R 
| 
   
> #one sided, one sample t test where the standard/population
  mean is 110 
> t.test(weightloss$weightpost, mu=110,
  alternative="less") 
        One Sample t-test 
data:  weightloss$weightpost 
t = -21.241, df = 49, p-value < 2.2e-16 
alternative hypothesis: true mean is less than 110 
95 percent confidence interval: 
     -Inf 85.66403 
sample estimates: 
mean of x  
  83.5786  
 | 
 |
  
  | 
 
### 正态分布的相关检验
> library(car)
 
> leveneTest(weight~gender, data=weightloss)
Levene's Test for Homogeneity of Variance (center = median)
      Df F value Pr(>F)
group  1  0.2166 0.6438
      48               
Warning message:
In leveneTest.default(y = y, group = group, ...) : group coerced to factor.
> #R tells us that gender is not recognised as a factor however has analysed the data as if it is.
> class(weightloss$gender)
[1] "character"
### if the Normality of the data is better, then there is
Bartlett test
> #Bartlett's test
> bartlett.test(weight~Fgender, data=weightloss)
 
        Bartlett test of homogeneity of variances
 
data:  weight by Fgender
Bartlett's K-squared = 0.016271, df = 1, p-value = 0.8985
### independent sample t test unequal/equal variances
| 
   
> t.test(weightloss$weight ~ weightloss$gender) 
        Welch Two Sample
  t-test 
data:  weightloss$weight
  by weightloss$gender 
t = -3.2986, df = 47.109, p-value = 0.001855 
alternative hypothesis: true difference in means is not equal to
  0 
95 percent confidence interval: 
 -11.133708  -2.698401 
sample estimates: 
mean in group female  
  mean in group male  
            85.93913             92.85519  
> # independent sample t test equal variances 
> t.test(weightloss$weight ~ weightloss$gender, var=TRUE) 
        Two Sample t-test 
data:  weightloss$weight
  by weightloss$gender 
t = -3.2915, df = 48, p-value = 0.001874 
alternative hypothesis: true difference in means is not equal to
  0 
95 percent confidence interval: 
 -11.140752  -2.691357 
sample estimates: 
mean in group female  
  mean in group male  
            85.93913             92.85519  
 | 
 |
  
  | 
 
#### ANOVA
#### ~ means “by”
#### using summary to get the values
> table(weightloss$group)
 
    eligible not eligible participated 
          41           41           50 
> boxplot(weightloss$weight~ weightloss$group)
> # check homogeneity of variance assumption using the Levene's test the CAR library
> library(car)
> #Change group to factor
> Fgroup<-factor(weightloss$group)
> class(Fgroup)
[1] "factor"
>
> leveneTest(weight~Fgroup, data=weightloss)
Levene's Test for Homogeneity of Variance (center = median)
       Df F value Pr(>F)
group   2  0.5807  0.561
      129               
> aovgroup<-aov(weightloss$weight~weightloss$group)
> summary(aovgroup)
                  Df Sum Sq Mean Sq F value   Pr(>F)    
weightloss$group   2   1212   606.2   9.017 0.000216 ***
Residuals        129   8673    67.2                     
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
 
| 
   
> # Post Hoc test to determine which groups differ 
> TukeyHSD(aovgroup) 
  Tukey multiple
  comparisons of means 
    95% family-wise
  confidence level 
Fit: aov(formula = weightloss$weight ~ weightloss$group) 
$`weightloss$group` 
                             
  diff        lwr       upr    
  p adj 
not eligible-eligible    
  3.863171 -0.4307754  8.157117
  0.0871709 
participated-eligible    
  7.334044  3.2378804 11.430207
  0.0001221 
participated-not eligible 3.470873 -0.6252903  7.567037 0.1140809 
 | 
 |
  
  | 
 |
| 
   
> # Non parametric alternative 
> #the kruskal-wallis test is the non parametric alternative
  to ANOVA. Note need to use group as factor. 
> Fgroup<-as.factor(weightloss$group) 
> kruskal.test(weightloss$weight~Fgroup) 
        Kruskal-Wallis rank
  sum test 
data:  weightloss$weight
  by Fgroup 
Kruskal-Wallis chi-squared = 15.772, df = 2, p-value = 0.000376 
 | 
 |
  
  | 
 
 
# Crosstabulations and the Chi
Square test
> #Crosstabulations
> table(weightloss$gender,
weightloss$previousdiet)
         no yes
  female 
9  14
  male  
20   7
>
chisq.test(weightloss$gender, weightloss$previousdiet)
        Pearson's Chi-squared test with Yates' continuity correction
data:  weightloss$gender and weightloss$previousdiet
X-squared = 4.8738, df = 1,
p-value = 0.02727
> #Constructing a
crosstabulation from summary data
>
priordiet<-structure(list(dieters=c(14,7), total=c(23,27)),
.names=c("dieters","total"),
row.names=c("female", "male"),
class="data.frame")
>
prop.test(priordiet$dieters, priordiet$total)
        2-sample test for equality of proportions with continuity
correction
data:  priordiet$dieters out of priordiet$total
X-squared = 4.8738, df = 1,
p-value = 0.02727
alternative hypothesis:
two.sided
95 percent confidence interval:
 0.05013246 0.64874032
sample estimates:
   prop 1   
prop 2 
0.6086957 0.2592593 
| 
   > # scatterplot of weight pre and weight post 
> plot(weightloss$weight, weightloss$weightpost) 
> correlation<-cor(weightloss$weight,
  weightloss$weightpost) 
> correlation 
[1] NA 
#### solve the NA by “pairwise.complete.obs” 
 | 
 |
  
  | 
 |
| 
   
> #returns NA as there is missing data in the weightloss post
  column,  
> #use Help to look at the documentation for cor to see the
  missing data options 
> correlation<-cor(weightloss$weight,
  weightloss$weightpost, use='pairwise.complete.obs') 
> correlation 
[1] 0.9014941 
 | 
 |
| 
   
> #The cor function only
  gives the correlation coefficient,  
> #if we wish to know the P
  value and/or CI associated with this coefficient we use the cor.test function 
>
  correlation2<-cor.test(weightloss$weight, weightloss$weightpost) 
> correlation2 
        Pearson's product-moment correlation 
data:  weightloss$weight and weightloss$weightpost 
t =
  14.431, df = 48, p-value < 2.2e-16 
alternative hypothesis: true
  correlation is not equal to 0 
95 percent confidence
  interval: 
 0.8318921 0.9431726 
sample estimates: 
      cor  
0.9014941  
### Regression Model 
>
  regression<-lm(weightloss$weight~weightloss$weightpost) 
> regression 
Call: 
lm(formula =
  weightloss$weight ~ weightloss$weightpost) 
Coefficients: 
          (Intercept)  weightloss$weightpost   
              20.1659                 0.8316   
> # additional commands for
  P values associated with the regression 
>  
> #note that plot can have
  several different meanings depending on what is being plotted. 
>  
> #adding gender 
>
  regressionmv2<-lm(weightloss$weightpost~weightloss$weight+weightloss$previousdiet+weightloss$gender) 
> regressionmv2 
Call: 
lm(formula =
  weightloss$weightpost ~ weightloss$weight + weightloss$previousdiet +  
    weightloss$gender) 
Coefficients: 
               (Intercept)           weightloss$weight  weightloss$previousdietyes       weightloss$gendermale   
                    0.7147                      0.9053                      0.4475                      2.7671   
> # use summary to obtain
  the P values for the coefficients 
> summary(regressionmv2) 
Call: 
lm(formula =
  weightloss$weightpost ~ weightloss$weight + weightloss$previousdiet +  
    weightloss$gender) 
Residuals: 
     Min      
  1Q   Median       3Q     
  Max  
-11.1188  -2.1084  
  0.0074   1.6764   7.1092  
Coefficients: 
                           Estimate Std.
  Error t value Pr(>|t|)     
(Intercept)                 0.71474    6.29633  
  0.114   0.9101     
weightloss$weight           0.90530    0.07309 
  12.386 2.96e-16 *** 
weightloss$previousdietyes  0.44753   
  1.14830   0.390   0.6985   
   
weightloss$gendermale       2.76708    1.25636  
  2.202   0.0327 *   
--- 
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘
  ’ 1 
Residual standard error: 3.73
  on 46 degrees of freedom 
  (82 observations deleted due to
  missingness) 
Multiple R-squared:  0.8312,   Adjusted
  R-squared:  0.8202  
F-statistic:  75.5 on 3 and 46 DF,  p-value: < 2.2e-16 
> confint(regressionmv2) 
                                 2.5 %    97.5 % 
(Intercept)                -11.9591136 13.388597 
weightloss$weight            0.7581759  1.052424 
weightloss$previousdietyes  -1.8638849 
  2.758949 
weightloss$gendermale        0.2381667  5.296003 
> #regression diagnostic
  plots 
> plot(regressionmv2) 
Hit <Return> to see next
  plot:  
Hit <Return> to see next
  plot:  
Hit <Return> to see next
  plot:  
Hit <Return> to see next
  plot:  
 | 
 |
| 
   | 
 
> 
> #Transforming a variable
and saving and dataframe
>
logweight<-log(weightloss$weight)
>
weightloss2<-cbind(weightloss, logweight)
> write.csv(weightloss2,
"weightloss2.csv")
> plot(weightloss2$logweight,
weightloss2$weightpost)
#### Plot examples
| 
   
> # Basic plots 
> x<-1:100 # generate a vector sequence of numbers from 1
  to 100 
> y<-x^2 # generate a vector of values of vector x squared 
> plot(x,y) # draw a scatterplot of x and y 
> plot(x,y, type='l') # change the default type of the plot to
  a line  
> plot(x,y, type='l',main="Wealth increase with
  age", xlab='age',ylab='wealth (in 1000s)', col='blue', lwd=4, lty=2) 
> # here we have plotted the same plot adding labels, changing
  the colour (col) to blue,  
> # the line width (lwd) to 4 (the default is 1)  
> # and the line type (lty) to 2 (1=solid, 2=dashed, 3=dotted,
  4=dotdash, 5=longdash, 6=two dash. 
> plot(x,y, type='l') # change the default type of the plot to
  a line 
 | 
 |
| 
   | 
 
#### ggplot2
| 
   
> library("ggplot2",
  lib.loc="~/R/win-library/3.3") 
Stackoverflow is a great place to get help:
  http://stackoverflow.com/tags/ggplot2. 
Warning message: 
package ‘ggplot2’ was built under R version 3.3.3  
>  
>  
> data() 
 | 
 |
| 
   | 
 |
| 
   
> head(diamonds) the diamonds datasets 
# A tibble: 6 x 10 
  carat cut       color clarity depth table price     x    
  y     z 
  <dbl>
  <ord>     <ord>
  <ord>   <dbl> <dbl>
  <int> <dbl> <dbl> <dbl> 
1 0.230 Ideal     E    
  SI2      61.5  55.0  
  326  3.95  3.98 
  2.43 
2 0.210 Premium   E    
  SI1      59.8  61.0  
  326  3.89  3.84 
  2.31 
3 0.230 Good      E     VS1     
  56.9  65.0   327 
  4.05  4.07  2.31 
4 0.290 Premium   I     VS2     
  62.4  58.0   334 
  4.20  4.23  2.63 
5 0.310 Good      J     SI2     
  63.3  58.0   335 
  4.34  4.35  2.75 
6 0.240 Very Good J    
  VVS2     62.8  57.0  
  336  3.94  3.96 
  2.48 
> #make a plot using base 
> plot(x=diamonds$carat, y=diamonds$price, type="p") 
> title(main="base scatterplot") 
 | 
 |
  
 
> #make a "quick"
  plot using ggplot2 
> qplot(x=carat, y=price,
  data=diamonds, geom="point") + 
+ ggtitle("ggplot
  scatterplot") 
 | 
 
#### to show colourful diagram using ggplot2
> # Advantages of ggplot
> # color by a continuous
variable
> # First we construct our
basic plot which we will call diamondplot, then we will add the colour options
> diamondplot <-
ggplot(data = diamonds, aes(x = carat, y = price))
> diamondplot +
geom_point(aes(colour = depth))
> # color by a factor
variable
> diamondplot +
geom_point(aes(colour = color))