知思辨行

1) Install packages

- Pixmap

- Tidyverse

- Car

- rmarkdown

2) CRAN: r repositories and packages to be downloaded using CURL

3) Assign value in R, the variable is case-sensitive

> four<-2+2

> four

[1] 4

> four

[1] 4

4) Import data from a .csv file

5) Using RStudio

### this is generated code from R Studio

> library(readr)

Warning message:

package ‘readr’ was built under R version 3.3.3

> weightloss <- read_csv("C:/Users/cw811/Desktop/Intro_to_R/weightloss.csv")

Parsed with column specification:

cols(

  weight = col_double(),

  weightpost = col_double(),

  gender = col_character(),

  group = col_character(),

  previousdiet = col_character()

> View(weightloss)

### to view a summary of the dataset

> summary(weightloss)

     weight         weightpost        gender             group           previousdiet

 Min.   : 65.74   Min.   : 63.07   Length:132         Length:132         Length:132

 1st Qu.: 80.33   1st Qu.: 78.29   Class :character   Class :character   Class :character

 Median : 84.28   Median : 82.55   Mode  :character   Mode  :character   Mode  :character

 Mean   : 86.32   Mean   : 83.58

 3rd Qu.: 93.03   3rd Qu.: 88.02

 Max.   :110.46   Max.   :104.10

                  NA's   :82

> hist(weightloss$weight)

> mean_weightloss

[1] 86.31773

> boxplot(weightloss$weight)

> hist(weightloss$weightpost)

#### Table command

> table(weightloss$previousdiet)

 no yes

 29  21

### Vector, Matrix , Dataframe, List

# c is a combined function, means putting the items into a colum

> a <- c(7.1,-8,3.9,4,9,5) # numeric vector

> b <- c("green","blue","red") # character vector

> c <- c(TRUE,FALSE,TRUE,FALSE,TRUE,FALSE) #logical vector

> #if we use by row=FALSE the matrix is filled by column

> matrix_2 = matrix(

+   c(6, 2, 5, 4, 9, 7), # the data elements

+   nrow=2,              # number of rows

+   ncol=3,              # number of columns

+   byrow = FALSE)       # fill matrix by rows

> matrix_2                # print the matrix

> #Data Frames

> Column_1 <-c(2, 5.5, 6, 1.3, 10, 4)

> Column_2 <-c(7, 3, 2.9, 4.5, 9.4, 1)

> Dataframe_1<-data.frame(Column_1, Column_2)

> Dataframe_1

### Subsetting: indexing and values

### Fairly slow on large computing

> # using vector a above we can select the 3rd element

> a[3]

[1] 3.9

> # select all elements except the 3rd element

> a[-3]

[1]  7.1 -8.0  4.0  9.0  5.0

> # select all elements whose values are < 4

> a[a<4]

[1] -8.0  3.9

> # select values equal to 9

> a[a==9]

[1] 9

> # select values greater than 2 & less than or equal to 5

> a[a>2 & a<=5]

[1] 3.9 4.0 5.0

> # select values of a less than or equal to 0 or greater than 5

> a[a<=0 | a>5]

[1]  7.1 -8.0  9.0

> # select elements 1 to 3 using the sequence operator :

> a[1:3]

[1]  7.1 -8.0  3.9

> # select elements 3 and 5

> a[c(3,5)]

[1] 3.9 9.0

> # check the class of a

> class(a)

[1] "numeric"

> #You can access the elements of a data frame using the number of the row and column entries.

> #The row is first and the column is second, calls are made using square brackets.

> #For example to call the element in the 3rd row of the 2nd column we use the following command

> Dataframe_1[3,2]

[1] 2.9

> Dataframe_1

  Column_1 Column_2

1      2.0      7.0

2      5.5      3.0

3      6.0      2.9

4      1.3      4.5

5     10.0      9.4

6      4.0      1.0

### Tidyverse for data manipulation

### Book: << R for Data Science>> http://r4ds.had.co.nz/

#### Tidyverse & tibble

> # Import weightloss as a "tibbles" instead of a "dataframes"

> weightlosst <-read_csv("C:/Users/cw811/Desktop/Intro_to_R/weightloss.csv")

> View(weightlosst)

> weightlosst

> select (weightlosst, weight, gender, group)

# A tibble: 132 x 3

weight gender group

1 76.2 female participated

2 77.9 female participated

3 78.8 female participated

4 78.9 female participated

5 79.0 female participated

6 79.0 female participated

7 79.3 female participated

8 81.0 male participated

9 82.2 male participated

10 83.1 male participated

# ... with 122 more rows

> filter(weightlosst, group=="participated")

# A tibble: 50 x 5

weight weightpost gender group previousdiet

1 76.2 76.6 female participated yes

2 77.9 63.1 female participated yes

3 78.8 66.6 female participated yes

4 78.9 69.3 female participated no

5 79.0 77.2 female participated yes

6 79.0 69.8 female participated no

7 79.3 71.8 female participated yes

8 81.0 77.9 male participated no

9 82.2 82.2 male participated no

10 83.1 82.1 male participated yes

# ... with 40 more rows

> weightlossint1 <- filter(weightlosst, weight>80)

> weightlossint2 <- select(weightlosst, weight, gender, group)

> View(weightlossint2)

> # 2: nesting

> weightlossint <- select(filter(weightlosst, weight>80), weight, gender, group)

> View(weightlossint)

> # 3: Pipes – simplify the intermediate steps and variable savings

> weightloss_sml <- weightlosst %>%

+   filter(weight > 80) %>%

+   select(weight, gender, group)

> View(weightloss_sml)

### mutate tibbles

> # We can use pipes to create new variables using existing variables with mutate

> weightlosst %>%

+ mutate(weightg =weight*1000)

# A tibble: 132 x 6

weight weightpost gender group previousdiet weightg

1 76.2 76.6 female participated yes 76220

2 77.9 63.1 female participated yes 77910

3 78.8 66.6 female participated yes 78810

4 78.9 69.3 female participated no 78890

5 79.0 77.2 female participated yes 78950

6 79.0 69.8 female participated no 78960

7 79.3 71.8 female participated yes 79310

8 81.0 77.9 male participated no 80970

9 82.2 82.2 male participated no 82180

10 83.1 82.1 male participated yes 83110

# ... with 122 more rows

### using group by the null values can be found eg.

> # Often we would want to split the data into groups and apply an analysis to the groups, we can use group_by to do this

> weightlosst %>%

+   group_by(gender) %>%

+   summarize(weight_av =mean(weight))

# A tibble: 3 x 2

  gender weight_av

  <chr>      <dbl>

1 female      85.9

2 male        92.9

3 <NA>        84.3

### Markdown and Statistical Tests

### Normality tests

> shapiro.test(weightloss$weight)

Shapiro-Wilk normality test

data: weightloss$weight

W = 0.98416, p-value = 0.1291

> shapiro.test(weightloss$weightpost)

Shapiro-Wilk normality test

data: weightloss$weightpost

W = 0.96941, p-value = 0.219

### Paired T tests

> t.test(weightloss$weight, weightloss$weightpost, paired=T)

Paired t-test

data: weightloss$weight and weightloss$weightpost

t = 11.309, df = 49, p-value = 2.903e-15

alternative hypothesis: true difference in means is not equal to 0

95 percent confidence interval:

5.012088 7.178312

sample estimates:

mean of the differences

6.0952

### One sided T tests in R

> #one sided, one sample t test where the standard/population mean is 110

> t.test(weightloss$weightpost, mu=110, alternative="less")

One Sample t-test

data: weightloss$weightpost

t = -21.241, df = 49, p-value < 2.2e-16

alternative hypothesis: true mean is less than 110

95 percent confidence interval:

-Inf 85.66403

sample estimates:

mean of x

83.5786

### 正态分布的相关检验

> library(car)

> leveneTest(weight~gender, data=weightloss)

Levene's Test for Homogeneity of Variance (center = median)

      Df F value Pr(>F)

group  1  0.2166 0.6438

Warning message:

In leveneTest.default(y = y, group = group, ...) : group coerced to factor.

> #R tells us that gender is not recognised as a factor however has analysed the data as if it is.

> class(weightloss$gender)

[1] "character"

### if the Normality of the data is better, then there is Bartlett test

> #Bartlett's test

> bartlett.test(weight~Fgender, data=weightloss)

        Bartlett test of homogeneity of variances

data:  weight by Fgender

Bartlett's K-squared = 0.016271, df = 1, p-value = 0.8985

### independent sample t test unequal/equal variances

> t.test(weightloss$weight ~ weightloss$gender)

Welch Two Sample t-test

data: weightloss$weight by weightloss$gender

t = -3.2986, df = 47.109, p-value = 0.001855

alternative hypothesis: true difference in means is not equal to 0

95 percent confidence interval:

-11.133708 -2.698401

sample estimates:

mean in group female mean in group male

85.93913 92.85519

> # independent sample t test equal variances

> t.test(weightloss$weight ~ weightloss$gender, var=TRUE)

Two Sample t-test

data: weightloss$weight by weightloss$gender

t = -3.2915, df = 48, p-value = 0.001874

alternative hypothesis: true difference in means is not equal to 0

95 percent confidence interval:

-11.140752 -2.691357

sample estimates:

mean in group female mean in group male

85.93913 92.85519

#### ANOVA

#### ~ means “by”

#### using summary to get the values

> table(weightloss$group)

    eligible not eligible participated

          41           41           50

> boxplot(weightloss$weight~ weightloss$group)

> # check homogeneity of variance assumption using the Levene's test the CAR library

> library(car)

> #Change group to factor

> Fgroup<-factor(weightloss$group)

> class(Fgroup)

[1] "factor"

> leveneTest(weight~Fgroup, data=weightloss)

Levene's Test for Homogeneity of Variance (center = median)

       Df F value Pr(>F)

group   2  0.5807  0.561

> aovgroup<-aov(weightloss$weight~weightloss$group)

> summary(aovgroup)

                  Df Sum Sq Mean Sq F value   Pr(>F)

weightloss$group   2   1212   606.2   9.017 0.000216 ***

Residuals        129   8673    67.2

---

Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

> # Post Hoc test to determine which groups differ

> TukeyHSD(aovgroup)

Tukey multiple comparisons of means

95% family-wise confidence level

Fit: aov(formula = weightloss$weight ~ weightloss$group)

$`weightloss$group`

diff lwr upr p adj

not eligible-eligible 3.863171 -0.4307754 8.157117 0.0871709

participated-eligible 7.334044 3.2378804 11.430207 0.0001221

participated-not eligible 3.470873 -0.6252903 7.567037 0.1140809

> # Non parametric alternative

> #the kruskal-wallis test is the non parametric alternative to ANOVA. Note need to use group as factor.

> Fgroup<-as.factor(weightloss$group)

> kruskal.test(weightloss$weight~Fgroup)

Kruskal-Wallis rank sum test

data: weightloss$weight by Fgroup

Kruskal-Wallis chi-squared = 15.772, df = 2, p-value = 0.000376

# Crosstabulations and the Chi Square test

> #Crosstabulations

> table(weightloss$gender, weightloss$previousdiet)

no yes

female 9 14

male 20 7

> chisq.test(weightloss$gender, weightloss$previousdiet)

Pearson's Chi-squared test with Yates' continuity correction

data: weightloss$gender and weightloss$previousdiet

X-squared = 4.8738, df = 1, p-value = 0.02727

> #Constructing a crosstabulation from summary data

> priordiet<-structure(list(dieters=c(14,7), total=c(23,27)), .names=c("dieters","total"), row.names=c("female", "male"), class="data.frame")

> prop.test(priordiet$dieters, priordiet$total)

2-sample test for equality of proportions with continuity correction

data: priordiet$dieters out of priordiet$total

X-squared = 4.8738, df = 1, p-value = 0.02727

alternative hypothesis: two.sided

95 percent confidence interval:

0.05013246 0.64874032

sample estimates:

prop 1 prop 2

0.6086957 0.2592593

> # scatterplot of weight pre and weight post

> plot(weightloss$weight, weightloss$weightpost)

> correlation<-cor(weightloss$weight, weightloss$weightpost)

> correlation

[1] NA

#### solve the NA by “pairwise.complete.obs”

> #returns NA as there is missing data in the weightloss post column,

> #use Help to look at the documentation for cor to see the missing data options

> correlation<-cor(weightloss$weight, weightloss$weightpost, use='pairwise.complete.obs')

> correlation

[1] 0.9014941

> #The cor function only gives the correlation coefficient,

> #if we wish to know the P value and/or CI associated with this coefficient we use the cor.test function

> correlation2<-cor.test(weightloss$weight, weightloss$weightpost)

> correlation2

Pearson's product-moment correlation

data: weightloss$weight and weightloss$weightpost

t = 14.431, df = 48, p-value < 2.2e-16

alternative hypothesis: true correlation is not equal to 0

95 percent confidence interval:

0.8318921 0.9431726

sample estimates:

cor

0.9014941

### Regression Model

> regression<-lm(weightloss$weight~weightloss$weightpost)

> regression

Call:

lm(formula = weightloss$weight ~ weightloss$weightpost)

Coefficients:

(Intercept) weightloss$weightpost

20.1659 0.8316

> # additional commands for P values associated with the regression

> #note that plot can have several different meanings depending on what is being plotted.

> #adding gender

> regressionmv2<-lm(weightloss$weightpost~weightloss$weight+weightloss$previousdiet+weightloss$gender)

> regressionmv2

Call:

lm(formula = weightloss$weightpost ~ weightloss$weight + weightloss$previousdiet +

weightloss$gender)

Coefficients:

(Intercept) weightloss$weight weightloss$previousdietyes weightloss$gendermale

0.7147 0.9053 0.4475 2.7671

> # use summary to obtain the P values for the coefficients

> summary(regressionmv2)

Call:

lm(formula = weightloss$weightpost ~ weightloss$weight + weightloss$previousdiet +

weightloss$gender)

Residuals:

Min 1Q Median 3Q Max

-11.1188 -2.1084 0.0074 1.6764 7.1092

Coefficients:

Estimate Std. Error t value Pr(>|t|)

(Intercept) 0.71474 6.29633 0.114 0.9101

weightloss$weight 0.90530 0.07309 12.386 2.96e-16 ***

weightloss$previousdietyes 0.44753 1.14830 0.390 0.6985

weightloss$gendermale 2.76708 1.25636 2.202 0.0327 *

---

Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 3.73 on 46 degrees of freedom

(82 observations deleted due to missingness)

Multiple R-squared: 0.8312, Adjusted R-squared: 0.8202

F-statistic: 75.5 on 3 and 46 DF, p-value: < 2.2e-16

> confint(regressionmv2)

2.5 % 97.5 %

(Intercept) -11.9591136 13.388597

weightloss$weight 0.7581759 1.052424

weightloss$previousdietyes -1.8638849 2.758949

weightloss$gendermale 0.2381667 5.296003

> #regression diagnostic plots

> plot(regressionmv2)

Hit <Return> to see next plot:

> #Transforming a variable and saving and dataframe

> logweight<-log(weightloss$weight)

> weightloss2<-cbind(weightloss, logweight)

> write.csv(weightloss2, "weightloss2.csv")

> plot(weightloss2$logweight, weightloss2$weightpost)

#### Plot examples

> # Basic plots

> x<-1:100 # generate a vector sequence of numbers from 1 to 100

> y<-x^2 # generate a vector of values of vector x squared

> plot(x,y) # draw a scatterplot of x and y

> plot(x,y, type='l') # change the default type of the plot to a line

> plot(x,y, type='l',main="Wealth increase with age", xlab='age',ylab='wealth (in 1000s)', col='blue', lwd=4, lty=2)

> # here we have plotted the same plot adding labels, changing the colour (col) to blue,

> # the line width (lwd) to 4 (the default is 1)

> # and the line type (lty) to 2 (1=solid, 2=dashed, 3=dotted, 4=dotdash, 5=longdash, 6=two dash.

> plot(x,y, type='l') # change the default type of the plot to a line

#### ggplot2

> library("ggplot2", lib.loc="~/R/win-library/3.3")

Stackoverflow is a great place to get help: http://stackoverflow.com/tags/ggplot2.

Warning message:

package ‘ggplot2’ was built under R version 3.3.3

> data()

> head(diamonds) the diamonds datasets

# A tibble: 6 x 10

carat cut color clarity depth table price x y z

1 0.230 Ideal E SI2 61.5 55.0 326 3.95 3.98 2.43

2 0.210 Premium E SI1 59.8 61.0 326 3.89 3.84 2.31

3 0.230 Good E VS1 56.9 65.0 327 4.05 4.07 2.31

4 0.290 Premium I VS2 62.4 58.0 334 4.20 4.23 2.63

5 0.310 Good J SI2 63.3 58.0 335 4.34 4.35 2.75

6 0.240 Very Good J VVS2 62.8 57.0 336 3.94 3.96 2.48

> #make a plot using base

> plot(x=diamonds$carat, y=diamonds$price, type="p")

> title(main="base scatterplot")

> #make a "quick" plot using ggplot2

> qplot(x=carat, y=price, data=diamonds, geom="point") +

+ ggtitle("ggplot scatterplot")

#### to show colourful diagram using ggplot2

> # Advantages of ggplot

> # color by a continuous variable

> # First we construct our basic plot which we will call diamondplot, then we will add the colour options

> diamondplot <- ggplot(data = diamonds, aes(x = carat, y = price))

> diamondplot + geom_point(aes(colour = depth))

> # color by a factor variable

> diamondplot + geom_point(aes(colour = color))

知思辨行

2018年2月20日星期二

Intro_To_R & RStudio