1)
Install packages
-
Pixmap
-
Tidyverse
-
Car
-
rmarkdown
2)
CRAN: r repositories
and packages to be downloaded using CURL
3)
Assign value in R,
the variable is case-sensitive
> four<-2+2
> four
[1] 4
> four
[1] 4
4)
Import data from a
.csv file
5)
Using RStudio
### this is generated code from R Studio
> library(readr)
Warning message:
package ‘readr’ was built under R version 3.3.3
> weightloss <- read_csv("C:/Users/cw811/Desktop/Intro_to_R/weightloss.csv")
Parsed with column specification:
cols(
weight = col_double(),
weightpost = col_double(),
gender = col_character(),
group = col_character(),
previousdiet = col_character()
)
> View(weightloss)
>
### to view a summary of the dataset
> summary(weightloss)
weight weightpost gender group previousdiet
Min. : 65.74 Min. : 63.07 Length:132 Length:132 Length:132
1st Qu.: 80.33 1st Qu.: 78.29 Class :character Class :character Class :character
Median : 84.28 Median : 82.55 Mode :character Mode :character Mode :character
Mean : 86.32 Mean : 83.58
3rd Qu.: 93.03 3rd Qu.: 88.02
Max. :110.46 Max. :104.10
NA's :82
> hist(weightloss$weight)
> mean_weightloss
[1] 86.31773
>
> boxplot(weightloss$weight)
> hist(weightloss$weightpost)
#### Table command
> table(weightloss$previousdiet)
no yes
29 21
### Vector, Matrix , Dataframe,
List
# c is a combined function, means putting the items into a
colum
> a <- c(7.1,-8,3.9,4,9,5) # numeric vector
> b <- c("green","blue","red") # character vector
> c <- c(TRUE,FALSE,TRUE,FALSE,TRUE,FALSE) #logical vector
> #if we use by row=FALSE the matrix is filled by column
> matrix_2 = matrix(
+ c(6, 2, 5, 4, 9, 7), # the data elements
+ nrow=2, # number of rows
+ ncol=3, # number of columns
+ byrow = FALSE) # fill matrix by rows
>
> matrix_2 # print the matrix
> #Data Frames
> Column_1 <-c(2, 5.5, 6, 1.3, 10, 4)
> Column_2 <-c(7, 3, 2.9, 4.5, 9.4, 1)
>
> Dataframe_1<-data.frame(Column_1, Column_2)
> Dataframe_1
### Subsetting: indexing and values
### Fairly slow on large computing
>
> # using vector a above we can select the 3rd element
> a[3]
[1] 3.9
> # select all elements except the 3rd element
> a[-3]
[1] 7.1 -8.0 4.0 9.0 5.0
> # select all elements whose values are < 4
> a[a<4]
[1] -8.0 3.9
> # select values equal to 9
> a[a==9]
[1] 9
> # select values greater than 2 & less than or equal to 5
> a[a>2 & a<=5]
[1] 3.9 4.0 5.0
> # select values of a less than or equal to 0 or greater than 5
> a[a<=0 | a>5]
[1] 7.1 -8.0 9.0
> # select elements 1 to 3 using the sequence operator :
> a[1:3]
[1] 7.1 -8.0 3.9
> # select elements 3 and 5
> a[c(3,5)]
[1] 3.9 9.0
> # check the class of a
> class(a)
[1] "numeric"
> #You can access the elements of a data frame using the number of the row and column entries.
> #The row is first and the column is second, calls are made using square brackets.
> #For example to call the element in the 3rd row of the 2nd column we use the following command
> Dataframe_1[3,2]
[1] 2.9
> Dataframe_1
Column_1 Column_2
1 2.0 7.0
2 5.5 3.0
3 6.0 2.9
4 1.3 4.5
5 10.0 9.4
6 4.0 1.0
### Tidyverse for data manipulation
### Book: << R for Data Science>> http://r4ds.had.co.nz/
#### Tidyverse & tibble
> # Import weightloss as a "tibbles" instead of a "dataframes"
> weightlosst <-read_csv("C:/Users/cw811/Desktop/Intro_to_R/weightloss.csv")
> View(weightlosst)
> weightlosst
> select (weightlosst, weight, gender, group)
# A tibble: 132 x 3
weight gender
group
<dbl>
<chr> <chr>
1 76.2 female participated
2 77.9 female participated
3 78.8 female participated
4 78.9 female participated
5 79.0 female participated
6 79.0 female participated
7 79.3 female participated
8 81.0 male
participated
9 82.2 male
participated
10 83.1 male participated
# ... with 122 more rows
>
>
>
> filter(weightlosst, group=="participated")
# A tibble: 50 x 5
weight weightpost gender
group previousdiet
<dbl> <dbl> <chr> <chr> <chr>
1 76.2
76.6 female participated yes
2 77.9
63.1 female participated yes
3 78.8
66.6 female participated yes
4 78.9
69.3 female participated no
5 79.0
77.2 female participated yes
6 79.0
69.8 female participated no
7 79.3
71.8 female participated yes
8 81.0
77.9 male participated no
9 82.2
82.2 male participated no
10 83.1 82.1 male participated yes
# ... with 40 more rows
|
|
|
> weightlossint1 <- filter(weightlosst, weight>80)
> weightlossint2 <- select(weightlosst, weight, gender, group)
> View(weightlossint2)
>
>
> # 2: nesting
> weightlossint <- select(filter(weightlosst, weight>80), weight, gender, group)
> View(weightlossint)
> # 3: Pipes – simplify the intermediate steps and variable savings
> weightloss_sml <- weightlosst %>%
+ filter(weight > 80) %>%
+ select(weight, gender, group)
> View(weightloss_sml)
### mutate tibbles
> # We can use pipes to create new variables using existing
variables with mutate
> weightlosst %>%
+ mutate(weightg
=weight*1000)
# A tibble: 132 x 6
weight weightpost
gender group previousdiet
weightg
<dbl> <dbl> <chr> <chr> <chr> <dbl>
1 76.2
76.6 female participated yes
76220
2 77.9
63.1 female participated yes
77910
3 78.8
66.6 female participated yes
78810
4 78.9
69.3 female participated no
78890
5 79.0
77.2 female participated yes
78950
6 79.0
69.8 female participated no
78960
7 79.3
71.8 female participated
yes 79310
8 81.0
77.9 male participated no 80970
9 82.2
82.2 male participated no 82180
10 83.1 82.1 male participated yes 83110
# ... with 122 more rows
|
|
|
### using group by the null values can be found eg.
> # Often we would want to split the data into groups and apply an analysis to the groups, we can use group_by to do this
> weightlosst %>%
+ group_by(gender) %>%
+ summarize(weight_av =mean(weight))
# A tibble: 3 x 2
gender weight_av
<chr> <dbl>
1 female 85.9
2 male 92.9
3 <NA> 84.3
### Markdown and Statistical Tests
### Normality tests
> shapiro.test(weightloss$weight)
Shapiro-Wilk
normality test
data: weightloss$weight
W = 0.98416, p-value = 0.1291
> shapiro.test(weightloss$weightpost)
Shapiro-Wilk
normality test
data:
weightloss$weightpost
W = 0.96941, p-value = 0.219
### Paired T tests
> t.test(weightloss$weight, weightloss$weightpost, paired=T)
Paired t-test
data: weightloss$weight
and weightloss$weightpost
t = 11.309, df = 49, p-value = 2.903e-15
alternative hypothesis: true difference in means is not equal to
0
95 percent confidence interval:
5.012088 7.178312
sample estimates:
mean of the differences
6.0952
|
|
|
### One sided T tests in R
> #one sided, one sample t test where the standard/population
mean is 110
> t.test(weightloss$weightpost, mu=110,
alternative="less")
One Sample t-test
data: weightloss$weightpost
t = -21.241, df = 49, p-value < 2.2e-16
alternative hypothesis: true mean is less than 110
95 percent confidence interval:
-Inf 85.66403
sample estimates:
mean of x
83.5786
|
|
|
### 正态分布的相关检验
> library(car)
> leveneTest(weight~gender, data=weightloss)
Levene's Test for Homogeneity of Variance (center = median)
Df F value Pr(>F)
group 1 0.2166 0.6438
48
Warning message:
In leveneTest.default(y = y, group = group, ...) : group coerced to factor.
> #R tells us that gender is not recognised as a factor however has analysed the data as if it is.
> class(weightloss$gender)
[1] "character"
### if the Normality of the data is better, then there is
Bartlett test
> #Bartlett's test
> bartlett.test(weight~Fgender, data=weightloss)
Bartlett test of homogeneity of variances
data: weight by Fgender
Bartlett's K-squared = 0.016271, df = 1, p-value = 0.8985
### independent sample t test unequal/equal variances
> t.test(weightloss$weight ~ weightloss$gender)
Welch Two Sample
t-test
data: weightloss$weight
by weightloss$gender
t = -3.2986, df = 47.109, p-value = 0.001855
alternative hypothesis: true difference in means is not equal to
0
95 percent confidence interval:
-11.133708 -2.698401
sample estimates:
mean in group female
mean in group male
85.93913 92.85519
> # independent sample t test equal variances
> t.test(weightloss$weight ~ weightloss$gender, var=TRUE)
Two Sample t-test
data: weightloss$weight
by weightloss$gender
t = -3.2915, df = 48, p-value = 0.001874
alternative hypothesis: true difference in means is not equal to
0
95 percent confidence interval:
-11.140752 -2.691357
sample estimates:
mean in group female
mean in group male
85.93913 92.85519
|
|
|
#### ANOVA
#### ~ means “by”
#### using summary to get the values
> table(weightloss$group)
eligible not eligible participated
41 41 50
> boxplot(weightloss$weight~ weightloss$group)
> # check homogeneity of variance assumption using the Levene's test the CAR library
> library(car)
> #Change group to factor
> Fgroup<-factor(weightloss$group)
> class(Fgroup)
[1] "factor"
>
> leveneTest(weight~Fgroup, data=weightloss)
Levene's Test for Homogeneity of Variance (center = median)
Df F value Pr(>F)
group 2 0.5807 0.561
129
> aovgroup<-aov(weightloss$weight~weightloss$group)
> summary(aovgroup)
Df Sum Sq Mean Sq F value Pr(>F)
weightloss$group 2 1212 606.2 9.017 0.000216 ***
Residuals 129 8673 67.2
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
> # Post Hoc test to determine which groups differ
> TukeyHSD(aovgroup)
Tukey multiple
comparisons of means
95% family-wise
confidence level
Fit: aov(formula = weightloss$weight ~ weightloss$group)
$`weightloss$group`
diff lwr upr
p adj
not eligible-eligible
3.863171 -0.4307754 8.157117
0.0871709
participated-eligible
7.334044 3.2378804 11.430207
0.0001221
participated-not eligible 3.470873 -0.6252903 7.567037 0.1140809
|
|
|
|
> # Non parametric alternative
> #the kruskal-wallis test is the non parametric alternative
to ANOVA. Note need to use group as factor.
> Fgroup<-as.factor(weightloss$group)
> kruskal.test(weightloss$weight~Fgroup)
Kruskal-Wallis rank
sum test
data: weightloss$weight
by Fgroup
Kruskal-Wallis chi-squared = 15.772, df = 2, p-value = 0.000376
|
|
|
# Crosstabulations and the Chi
Square test
> #Crosstabulations
> table(weightloss$gender,
weightloss$previousdiet)
no yes
female
9 14
male
20 7
>
chisq.test(weightloss$gender, weightloss$previousdiet)
Pearson's Chi-squared test with Yates' continuity correction
data: weightloss$gender and weightloss$previousdiet
X-squared = 4.8738, df = 1,
p-value = 0.02727
> #Constructing a
crosstabulation from summary data
>
priordiet<-structure(list(dieters=c(14,7), total=c(23,27)),
.names=c("dieters","total"),
row.names=c("female", "male"),
class="data.frame")
>
prop.test(priordiet$dieters, priordiet$total)
2-sample test for equality of proportions with continuity
correction
data: priordiet$dieters out of priordiet$total
X-squared = 4.8738, df = 1,
p-value = 0.02727
alternative hypothesis:
two.sided
95 percent confidence interval:
0.05013246 0.64874032
sample estimates:
prop 1
prop 2
0.6086957 0.2592593
> # scatterplot of weight pre and weight post
> plot(weightloss$weight, weightloss$weightpost)
> correlation<-cor(weightloss$weight,
weightloss$weightpost)
> correlation
[1] NA
#### solve the NA by “pairwise.complete.obs”
|
|
|
|
> #returns NA as there is missing data in the weightloss post
column,
> #use Help to look at the documentation for cor to see the
missing data options
> correlation<-cor(weightloss$weight,
weightloss$weightpost, use='pairwise.complete.obs')
> correlation
[1] 0.9014941
|
|
> #The cor function only
gives the correlation coefficient,
> #if we wish to know the P
value and/or CI associated with this coefficient we use the cor.test function
>
correlation2<-cor.test(weightloss$weight, weightloss$weightpost)
> correlation2
Pearson's product-moment correlation
data: weightloss$weight and weightloss$weightpost
t =
14.431, df = 48, p-value < 2.2e-16
alternative hypothesis: true
correlation is not equal to 0
95 percent confidence
interval:
0.8318921 0.9431726
sample estimates:
cor
0.9014941
### Regression Model
>
regression<-lm(weightloss$weight~weightloss$weightpost)
> regression
Call:
lm(formula =
weightloss$weight ~ weightloss$weightpost)
Coefficients:
(Intercept) weightloss$weightpost
20.1659 0.8316
> # additional commands for
P values associated with the regression
>
> #note that plot can have
several different meanings depending on what is being plotted.
>
> #adding gender
>
regressionmv2<-lm(weightloss$weightpost~weightloss$weight+weightloss$previousdiet+weightloss$gender)
> regressionmv2
Call:
lm(formula =
weightloss$weightpost ~ weightloss$weight + weightloss$previousdiet +
weightloss$gender)
Coefficients:
(Intercept) weightloss$weight weightloss$previousdietyes weightloss$gendermale
0.7147 0.9053 0.4475 2.7671
> # use summary to obtain
the P values for the coefficients
> summary(regressionmv2)
Call:
lm(formula =
weightloss$weightpost ~ weightloss$weight + weightloss$previousdiet +
weightloss$gender)
Residuals:
Min
1Q Median 3Q
Max
-11.1188 -2.1084
0.0074 1.6764 7.1092
Coefficients:
Estimate Std.
Error t value Pr(>|t|)
(Intercept) 0.71474 6.29633
0.114 0.9101
weightloss$weight 0.90530 0.07309
12.386 2.96e-16 ***
weightloss$previousdietyes 0.44753
1.14830 0.390 0.6985
weightloss$gendermale 2.76708 1.25636
2.202 0.0327 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘
’ 1
Residual standard error: 3.73
on 46 degrees of freedom
(82 observations deleted due to
missingness)
Multiple R-squared: 0.8312, Adjusted
R-squared: 0.8202
F-statistic: 75.5 on 3 and 46 DF, p-value: < 2.2e-16
> confint(regressionmv2)
2.5 % 97.5 %
(Intercept) -11.9591136 13.388597
weightloss$weight 0.7581759 1.052424
weightloss$previousdietyes -1.8638849
2.758949
weightloss$gendermale 0.2381667 5.296003
> #regression diagnostic
plots
> plot(regressionmv2)
Hit <Return> to see next
plot:
Hit <Return> to see next
plot:
Hit <Return> to see next
plot:
Hit <Return> to see next
plot:
|
|
|
>
> #Transforming a variable
and saving and dataframe
>
logweight<-log(weightloss$weight)
>
weightloss2<-cbind(weightloss, logweight)
> write.csv(weightloss2,
"weightloss2.csv")
> plot(weightloss2$logweight,
weightloss2$weightpost)
#### Plot examples
> # Basic plots
> x<-1:100 # generate a vector sequence of numbers from 1
to 100
> y<-x^2 # generate a vector of values of vector x squared
> plot(x,y) # draw a scatterplot of x and y
> plot(x,y, type='l') # change the default type of the plot to
a line
> plot(x,y, type='l',main="Wealth increase with
age", xlab='age',ylab='wealth (in 1000s)', col='blue', lwd=4, lty=2)
> # here we have plotted the same plot adding labels, changing
the colour (col) to blue,
> # the line width (lwd) to 4 (the default is 1)
> # and the line type (lty) to 2 (1=solid, 2=dashed, 3=dotted,
4=dotdash, 5=longdash, 6=two dash.
> plot(x,y, type='l') # change the default type of the plot to
a line
|
|
|
#### ggplot2
> library("ggplot2",
lib.loc="~/R/win-library/3.3")
Stackoverflow is a great place to get help:
http://stackoverflow.com/tags/ggplot2.
Warning message:
package ‘ggplot2’ was built under R version 3.3.3
>
>
> data()
|
|
|
|
> head(diamonds) the diamonds datasets
# A tibble: 6 x 10
carat cut color clarity depth table price x
y z
<dbl>
<ord> <ord>
<ord> <dbl> <dbl>
<int> <dbl> <dbl> <dbl>
1 0.230 Ideal E
SI2 61.5 55.0
326 3.95 3.98
2.43
2 0.210 Premium E
SI1 59.8 61.0
326 3.89 3.84
2.31
3 0.230 Good E VS1
56.9 65.0 327
4.05 4.07 2.31
4 0.290 Premium I VS2
62.4 58.0 334
4.20 4.23 2.63
5 0.310 Good J SI2
63.3 58.0 335
4.34 4.35 2.75
6 0.240 Very Good J
VVS2 62.8 57.0
336 3.94 3.96
2.48
> #make a plot using base
> plot(x=diamonds$carat, y=diamonds$price, type="p")
> title(main="base scatterplot")
|
|
> #make a "quick"
plot using ggplot2
> qplot(x=carat, y=price,
data=diamonds, geom="point") +
+ ggtitle("ggplot
scatterplot")
|
#### to show colourful diagram using ggplot2
> # Advantages of ggplot
> # color by a continuous
variable
> # First we construct our
basic plot which we will call diamondplot, then we will add the colour options
> diamondplot <-
ggplot(data = diamonds, aes(x = carat, y = price))
> diamondplot +
geom_point(aes(colour = depth))
> # color by a factor
variable
> diamondplot +
geom_point(aes(colour = color))