corporate@oreilly.com

$ sudo apt-get install r-base
$ sudo yum install R.i386
$ sudo apt-get install r-base-html r-doc-html
1 + 1
#> [1] 2
max(1, 3, 5)
#> [1] 5
max(
  1, 3,
  +5
)
#> [1] 5
q()
help.start()
help(functionname)
args(functionname)
example(functionname)
help(mean)
?mean
args(mean)
#> function (x, ...)
#> NULL
args(sd)
#> function (x, na.rm = FALSE)
#> NULL
example(mean)
#>
#> mean> x <- c(0:10, 50)
#>
#> mean> xm <- mean(x)
#>
#> mean> c(xm, mean(x, trim = 0.10))
#> [1] 8.75 5.50
help.search("pattern")
> ??pattern
help(adf.test)
#> No documentation for 'adf.test' in specified packages and libraries:
#> you could try '??adf.test'
help.search("adf.test")
Help files with alias or concept or title matching 'adf.test' using
regular expression matching:

tseries::adf.test       Augmented Dickey-Fuller Test

Type '?PKG::FOO' to inspect entry 'PKG::FOO TITLE'.
help(adf.test, package = "tseries")
?tseries::adf.test
help.search("dickey-fuller")
Help files with alias or concept or title matching 'dickey-fuller' using
fuzzy matching:

fUnitRoots::DickeyFullerPValues
                         Dickey-Fuller p Values
tseries::adf.test        Augmented Dickey-Fuller Test
urca::ur.df              Augmented-Dickey-Fuller Unit Root Test

Type '?PKG::FOO' to inspect entry 'PKG::FOO TITLE'.
help(package = "packagename")
help(package = "tseries")
vignette()
vignette(package = "packagename")
vignette("vignettename")
RSiteSearch("key phrase")
RSiteSearch("canonical correlation")
set.seed(42)
n <- 4
example_df <- data.frame(
  some_reals = rnorm(n),
  some_letters = sample(LETTERS, n, replace = TRUE),
  some_ints = sample(1:10, n, replace = TRUE)
)
example_df
#>   some_reals some_letters some_ints
#> 1      1.371            R        10
#> 2     -0.565            S         3
#> 3      0.363            L         5
#> 4      0.633            S        10
data(mtcars)
head(mtcars)
#>                    mpg cyl disp  hp drat   wt qsec vs am gear carb
#> Mazda RX4         21.0   6  160 110 3.90 2.62 16.5  0  1    4    4
#> Mazda RX4 Wag     21.0   6  160 110 3.90 2.88 17.0  0  1    4    4
#> Datsun 710        22.8   4  108  93 3.85 2.32 18.6  1  1    4    1
#> Hornet 4 Drive    21.4   6  258 110 3.08 3.21 19.4  1  0    3    1
#> Hornet Sportabout 18.7   8  360 175 3.15 3.44 17.0  0  0    3    2
#> Valiant           18.1   6  225 105 2.76 3.46 20.2  1  0    3    1
dput(head(mtcars, 2))
#> structure(list(mpg = c(21, 21), cyl = c(6, 6), disp = c(160,
#> 160), hp = c(110, 110), drat = c(3.9, 3.9), wt = c(2.62, 2.875
#> ), qsec = c(16.46, 17.02), vs = c(0, 0), am = c(1, 1), gear = c(4,
#> 4), carb = c(4, 4)), row.names = c("Mazda RX4", "Mazda RX4 Wag"
#> ), class = "data.frame")
example_df <- structure(list(mpg = c(21, 21), cyl = c(6, 6), disp = c(160,
160), hp = c(110, 110), drat = c(3.9, 3.9), wt = c(2.62, 2.875
), qsec = c(16.46, 17.02), vs = c(0, 0), am = c(1, 1), gear = c(4,
4), carb = c(4, 4)), row.names = c("Mazda RX4", "Mazda RX4 Wag"
), class = "data.frame")

example_df
#>               mpg cyl disp  hp drat   wt qsec vs am gear carb
#> Mazda RX4      21   6  160 110  3.9 2.62 16.5  0  1    4    4
#> Mazda RX4 Wag  21   6  160 110  3.9 2.88 17.0  0  1    4    4
pi
#> [1] 3.14
sqrt(2)
#> [1] 1.41
print(pi)
#> [1] 3.14
print(sqrt(2))
#> [1] 1.41
print(matrix(c(1, 2, 3, 4), 2, 2))
#>      [,1] [,2]
#> [1,]    1    3
#> [2,]    2    4
print(list("a", "b", "c"))
#> [[1]]
#> [1] "a"
#>
#> [[2]]
#> [1] "b"
#>
#> [[3]]
#> [1] "c"
print("The zero occurs at", 2 * pi, "radians.")
#> Error in print.default("The zero occurs at", 2 * pi, "radians."): invalid 'quote' argument
print("The zero occurs at")
#> [1] "The zero occurs at"
print(2 * pi)
#> [1] 6.28
print("radians")
#> [1] "radians"
cat("The zero occurs at", 2 * pi, "radians.", "\n")
#> The zero occurs at 6.28 radians.
fib <- c(0, 1, 1, 2, 3, 5, 8, 13, 21, 34)
cat("The first few Fibonacci numbers are:", fib, "...\n")
#> The first few Fibonacci numbers are: 0 1 1 2 3 5 8 13 21 34 ...
cat(list("a", "b", "c"))
#> Error in cat(list("a", "b", "c")): argument 1 (type 'list') cannot be handled by 'cat'
x <- 3
x <- 3
y <- 4
z <- sqrt(x^2 + y^2)
print(z)
#> [1] 5
x <- 3
print(x)
#> [1] 3

x <- c("fee", "fie", "foe", "fum")
print(x)
#> [1] "fee" "fie" "foe" "fum"
x <<- 3
foo <- 3
print(foo)
#> [1] 3
5 -> fum
print(fum)
#> [1] 5
library(tidyverse)

mpg %>%
  head %>%
  print
#> # A tibble: 6 x 11
#>   manufacturer model displ  year   cyl trans drv     cty   hwy fl    class
#>   <chr>        <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
#> 1 audi         a4      1.8  1999     4 auto~ f        18    29 p     comp~
#> 2 audi         a4      1.8  1999     4 manu~ f        21    29 p     comp~
#> 3 audi         a4      2    2008     4 manu~ f        20    31 p     comp~
#> 4 audi         a4      2    2008     4 auto~ f        21    30 p     comp~
#> 5 audi         a4      2.8  1999     6 auto~ f        16    26 p     comp~
#> 6 audi         a4      2.8  1999     6 manu~ f        18    26 p     comp~
print(head(mpg))
#> # A tibble: 6 x 11
#>   manufacturer model displ  year   cyl trans drv     cty   hwy fl    class
#>   <chr>        <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
#> 1 audi         a4      1.8  1999     4 auto~ f        18    29 p     comp~
#> 2 audi         a4      1.8  1999     4 manu~ f        21    29 p     comp~
#> 3 audi         a4      2    2008     4 manu~ f        20    31 p     comp~
#> 4 audi         a4      2    2008     4 auto~ f        21    30 p     comp~
#> 5 audi         a4      2.8  1999     6 auto~ f        16    26 p     comp~
#> 6 audi         a4      2.8  1999     6 manu~ f        18    26 p     comp~
x %>% head

head(x)
mpg %>%
  head %>%
  print

has the same effect as this code which use an intermediate variable.
x <- head(mpg)
print(x)
print(head(mpg))
iris %>% head(10)

head(iris, 10)
10 %>% head(x, .)

head(x, 10)
x <- 10
y <- 50
z <- c("three", "blind", "mice")
f <- function(n, p) sqrt(p * (1 - p) / n)
ls()
#> [1] "f" "x" "y" "z"
ls()
#> character(0)
x <- 10
y <- 50
z <- c("three", "blind", "mice")
f <- function(n, p) sqrt(p * (1 - p) / n)
ls.str()
#> f : function (n, p)
#> x :  num 10
#> y :  num 50
#> z :  chr [1:3] "three" "blind" "mice"
ls()
#> [1] "f" "x" "y" "z"
ls(all.names = TRUE)
#> [1] ".Random.seed" "f"            "x"            "y"
#> [5] "z"
x <- 2 * pi
x
#> [1] 6.28
rm(x)
x
#> Error in eval(expr, envir, enclos): object 'x' not found
rm(x, y, z)
ls()
#> [1] "f" "x" "y" "z"
rm(list = ls())
ls()
#> character(0)
c(1, 1, 2, 3, 5, 8, 13, 21)
#> [1]  1  1  2  3  5  8 13 21
c(1 * pi, 2 * pi, 3 * pi, 4 * pi)
#> [1]  3.14  6.28  9.42 12.57
c("My", "twitter", "handle", "is", "@cmastication")
#> [1] "My"            "twitter"       "handle"        "is"
#> [5] "@cmastication"
c(TRUE, TRUE, FALSE, TRUE)
#> [1]  TRUE  TRUE FALSE  TRUE
v1 <- c(1, 2, 3)
v2 <- c(4, 5, 6)
c(v1, v2)
#> [1] 1 2 3 4 5 6
v1 <- c(1, 2, 3)
v3 <- c("A", "B", "C")
c(v1, v3)
#> [1] "1" "2" "3" "A" "B" "C"
mode(3.1415)
#> [1] "numeric"
mode("foo")
#> [1] "character"
c(3.1415, "foo")
#> [1] "3.1415" "foo"
mode(c(3.1415, "foo"))
#> [1] "character"
x <- c(0, 1, 1, 2, 3, 5, 8, 13, 21, 34)
mean(x)
#> [1] 8.8
median(x)
#> [1] 4
sd(x)
#> [1] 11
var(x)
#> [1] 122
x <- c(0, 1, 1, 2, 3, 5, 8, 13, 21, 34)
y <- log(x + 1)
cor(x, y)
#> [1] 0.907
cov(x, y)
#> [1] 11.5
x <- c(0, 1, 1, 2, 3, NA)
mean(x)
#> [1] NA
sd(x)
#> [1] NA
x <- c(0, 1, 1, 2, 3, NA)
sd(x, na.rm = TRUE)
#> [1] 1.14
data(cars)

map_dbl(cars, mean)
#> speed  dist
#>  15.4  43.0
map_dbl(cars, sd)
#> speed  dist
#>  5.29 25.77
map_dbl(cars, median)
#> speed  dist
#>    15    36
var(cars)
#>       speed dist
#> speed    28  110
#> dist    110  664
cor(cars)
#>       speed  dist
#> speed 1.000 0.807
#> dist  0.807 1.000
cov(cars)
#>       speed dist
#> speed    28  110
#> dist    110  664
1:5
#> [1] 1 2 3 4 5
seq(from = 1, to = 5, by = 2)
#> [1] 1 3 5
rep(1, times = 5)
#> [1] 1 1 1 1 1
0:9
#>  [1] 0 1 2 3 4 5 6 7 8 9
10:19
#>  [1] 10 11 12 13 14 15 16 17 18 19
9:0
#>  [1] 9 8 7 6 5 4 3 2 1 0
10:20 %>% mean()
seq(from = 0, to = 20)
#>  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
seq(from = 0, to = 20, by = 2)
#>  [1]  0  2  4  6  8 10 12 14 16 18 20
seq(from = 0, to = 20, by = 5)
#> [1]  0  5 10 15 20
seq(from = 0, to = 20, length.out = 5)
#> [1]  0  5 10 15 20
seq(from = 0, to = 100, length.out = 5)
#> [1]   0  25  50  75 100
seq(from = 1.0, to = 2.0, length.out = 5)
#> [1] 1.00 1.25 1.50 1.75 2.00
rep(pi, times = 5)
#> [1] 3.14 3.14 3.14 3.14 3.14
a <- 3
a == pi # Test for equality
#> [1] FALSE
a != pi # Test for inequality
#> [1] TRUE
a < pi
#> [1] TRUE
a > pi
#> [1] FALSE
a <= pi
#> [1] TRUE
a >= pi
#> [1] FALSE
v <- c(3, pi, 4)
w <- c(pi, pi, pi)
v == w # Compare two 3-element vectors
#> [1] FALSE  TRUE FALSE
v != w
#> [1]  TRUE FALSE  TRUE
v < w
#> [1]  TRUE FALSE FALSE
v <= w
#> [1]  TRUE  TRUE FALSE
v > w
#> [1] FALSE FALSE  TRUE
v >= w
#> [1] FALSE  TRUE  TRUE
v <- c(3, pi, 4)
v == pi # Compare a 3-element vector against one number
#> [1] FALSE  TRUE FALSE
v != pi
#> [1]  TRUE FALSE  TRUE
v <- c(3, pi, 4)
any(v == pi) # Return TRUE if any element of v equals pi
#> [1] TRUE
all(v == 0) # Return TRUE if all elements of v are zero
#> [1] FALSE
fib <- c(0, 1, 1, 2, 3, 5, 8, 13, 21, 34)
fib
#>  [1]  0  1  1  2  3  5  8 13 21 34
fib[1]
#> [1] 0
fib[2]
#> [1] 1
fib[3]
#> [1] 1
fib[4]
#> [1] 2
fib[5]
#> [1] 3
fib[1:3] # Select elements 1 through 3
#> [1] 0 1 1
fib[4:9] # Select elements 4 through 9
#> [1]  2  3  5  8 13 21
fib[c(1, 2, 4, 8)]
#> [1]  0  1  2 13
fib[-1] # Ignore first element
#> [1]  1  1  2  3  5  8 13 21 34
fib[1:3] # As before
#> [1] 0 1 1
fib[-(1:3)] # Invert sign of index to exclude instead of select
#> [1]  2  3  5  8 13 21 34
fib < 10 # This vector is TRUE wherever fib is less than 10
#>  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE
fib[fib < 10] # Use that vector to select elements less than 10
#> [1] 0 1 1 2 3 5 8
fib %% 2 == 0 # This vector is TRUE wherever fib is even
#>  [1]  TRUE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE  TRUE
fib[fib %% 2 == 0] # Use that vector to select the even elements
#> [1]  0  2  8 34
v <- c(3, 6, 1, 9, 11, 16, 0, 3, 1, 45, 2, 8, 9, 6, -4)
v[ v > median(v)]
#> [1]  9 11 16 45  8  9
v[ (v < quantile(v, 0.05)) | (v > quantile(v, 0.95)) ]
#> [1] 45 -4
v[ abs(v - mean(v)) > sd(v)]
#> [1] 45 -4
v <- c(1, 2, 3, NA, 5)
v[!is.na(v) & !is.null(v)]
#> [1] 1 2 3 5
years <- c(1960, 1964, 1976, 1994)
names(years) <- c("Kennedy", "Johnson", "Carter", "Clinton")
years
#> Kennedy Johnson  Carter Clinton
#>    1960    1964    1976    1994
years["Carter"]
#> Carter
#>   1976
years["Clinton"]
#> Clinton
#>    1994
years[c("Carter", "Clinton")]
#>  Carter Clinton
#>    1976    1994
v <- c(11, 12, 13, 14, 15)
w <- c(1, 2, 3, 4, 5)
v + w
#> [1] 12 14 16 18 20
v - w
#> [1] 10 10 10 10 10
v * w
#> [1] 11 24 39 56 75
v / w
#> [1] 11.00  6.00  4.33  3.50  3.00
w^v
#> [1] 1.00e+00 4.10e+03 1.59e+06 2.68e+08 3.05e+10
w
#> [1] 1 2 3 4 5
w + 2
#> [1] 3 4 5 6 7
w - 2
#> [1] -1  0  1  2  3
w * 2
#> [1]  2  4  6  8 10
w / 2
#> [1] 0.5 1.0 1.5 2.0 2.5
2^w
#> [1]  2  4  8 16 32
w
#> [1] 1 2 3 4 5
mean(w)
#> [1] 3
w - mean(w)
#> [1] -2 -1  0  1  2
w
#> [1] 1 2 3 4 5
sd(w)
#> [1] 1.58
(w - mean(w)) / sd(w)
#> [1] -1.265 -0.632  0.000  0.632  1.265
w <- 1:5
w
#> [1] 1 2 3 4 5
sqrt(w)
#> [1] 1.00 1.41 1.73 2.00 2.24
log(w)
#> [1] 0.000 0.693 1.099 1.386 1.609
sin(w)
#> [1]  0.841  0.909  0.141 -0.757 -0.959
n <- 10
0:n - 1
#>  [1] -1  0  1  2  3  4  5  6  7  8  9
library(tidyverse)
data(mpg)

mpg %>%
  filter(cty > 21) %>%
  head(3) %>%
  print()
#> # A tibble: 3 x 11
#>   manufacturer model displ  year   cyl trans drv     cty   hwy fl    class
#>   <chr>        <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
#> 1 chevrolet    mali~   2.4  2008     4 auto~ f        22    30 r     mids~
#> 2 honda        civic   1.6  1999     4 manu~ f        28    33 r     subc~
#> 3 honda        civic   1.6  1999     4 auto~ f        24    32 r     subc~
temp1 <- filter(mpg, cty > 21)
temp2 <- head(temp1, 3)
print(temp2)
#> # A tibble: 3 x 11
#>   manufacturer model displ  year   cyl trans drv     cty   hwy fl    class
#>   <chr>        <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
#> 1 chevrolet    mali~   2.4  2008     4 auto~ f        22    30 r     mids~
#> 2 honda        civic   1.6  1999     4 manu~ f        28    33 r     subc~
#> 3 honda        civic   1.6  1999     4 auto~ f        24    32 r     subc~
x %>% head()
head(x)
x %>% head(n = 10)

head(x, n = 10)
library(tidyverse)

filtered_mpg <- filter(mpg, cty > 21)
selected_mpg <- select(filtered_mpg, cty, hwy)
ggplot(selected_mpg, aes(cty, hwy)) + geom_point()
ggplot(select(filter(mpg, cty > 21), cty, hwy), aes(cty, hwy)) + geom_point()
mpg %>%
  filter(cty > 21) %>%
  select(cty, hwy) %>%
  ggplot(aes(cty, hwy)) + geom_point()
iris %>% head(3)
iris %>% head(3, x = .)
ls()
ls

# > function (name, pos = -1L, envir = as.environment(pos), all.names = FALSE,
# >     pattern, sorted = TRUE)
# > {
# >     if (!missing(name)) {
# >         pos <- tryCatch(name, error = function(e) e)
# >         if (inherits(pos, "error")) {
# >             name <- substitute(name)
# >             if (!is.character(name))
# >                 name <- deparse(name)
# > etc...
tbl <- read.csv("F:\research\bio\assay.csv")
x <- pi # Set x to 3.1415926...
x < -pi # Oops! We are comparing x instead of setting it!
#> [1] FALSE
x < -pi
#> Error in eval(expr, envir, enclos): object 'x' not found
x <- 0 # Initialize x to zero
x < -pi # Oops!
#> [1] FALSE
total <- 1 + 2 + 3 + # Continued on the next line
  4 + 5
print(total)
#> [1] 15
total <- 1 + 2 + 3 # Oops! R sees a complete expression
+4 + 5 # This is a new expression; R prints its value
#> [1] 9
print(total)
#> [1] 6
v <- 1 # Assign 1 to v
v == 0 # Compare v against zero
#> [1] FALSE
v <- 0 # Assign 0 to v, overwriting previous contents
n <- 5
1:n + 1
#> [1] 2 3 4 5 6
1:(n + 1)
#> [1] 1 2 3 4 5 6
x <- rnorm(100)
n <- 5
truehist(x, n)
#> Error in truehist(x, n): could not find function "truehist"
library(MASS) # Load the MASS package into R
truehist(x, n)
Error: '\m' is an unrecognized escape in character string starting "'.\temp\m"
read_csv(`./temp/my_file.csv`)
read_csv(`.\\temp\\my_file.csv`)
getwd()
#> [1] "/Users/jal/DocumentsPersonal/R-Cookbook"
setwd("~/Documents/MyDirectory")
save.image()
history()
history(100)          # Show 100 most recent lines of history
history(Inf)          # Show entire saved history
aVeryLongRunningFunction()  # Oops! Forgot to save the result!
x <- .Last.value            # Capture the result now
search()
search()
#>  [1] ".GlobalEnv"        "package:knitr"     "package:forcats"
#>  [4] "package:stringr"   "package:dplyr"     "package:purrr"
#>  [7] "package:readr"     "package:tidyr"     "package:tibble"
#> [10] "package:ggplot2"   "package:tidyverse" "package:stats"
#> [13] "package:graphics"  "package:grDevices" "package:utils"
#> [16] "package:datasets"  "package:methods"   "Autoloads"
#> [19] "package:base"
library(packagename)
lda(x)
#> Error in lda(x): could not find function "lda"
my_model <- lda(cty ~ displ + year, data = mpg)
#> Error in lda(cty ~ displ + year, data = mpg): could not find function "lda"

library(MASS)                          # Load the MASS library into memory
#>
#> Attaching package: 'MASS'
#> The following object is masked from 'package:dplyr':
#>
#>     select
my_model <-
  lda(cty ~ displ + year, data = mpg)  # Now R can find the function
detach(package:MASS)
data(dsname, package = "pkgname")
head(pressure)
#>   temperature pressure
#> 1           0   0.0002
#> 2          20   0.0012
#> 3          40   0.0060
#> 4          60   0.0300
#> 5          80   0.0900
#> 6         100   0.2700
help(pressure)      # Bring up help page for pressure dataset
data()              # Bring up a list of datasets
data(Cars93, package = "MASS")
data(package = "pkgname")
library()
installed.packages()[1:5, c("Package", "Version")]
#>            Package      Version
#> abind      "abind"      "1.4-5"
#> ade4       "ade4"       "1.7-13"
#> adegenet   "adegenet"   "2.1.1"
#> ape        "ape"        "5.2"
#> assertthat "assertthat" "0.2.0"
install.packages("packagename")
install.packages("packagename", lib = "~/lib/R")
install_github("thomasp85/tidygraph")
choosessmirror()
R will present a list of CRAN mirrors.
options("repos")[[1]][1]
options(repos = c(CRAN = "http://cran.rstudio.com"))
source("myScript.R")
print("Hello, World!")
source("hello.R")
#> [1] "Hello, World!"
source("hello.R", echo = TRUE)
#>
#> > print("Hello, World!")
#> [1] "Hello, World!"
R CMD BATCH scriptfile outputfile
Rscript scriptfile arg1 arg2 arg3
R CMD BATCH --quiet myScript.R results.out
Rscript myScript.R arg1 arg2 arg3
argv <- commandArgs(TRUE)
Rscript --slave myScript.R arg1 arg2 arg3 >results.out
argv <- commandArgs(TRUE)
x <- as.numeric(argv[1])
y <- as.numeric(argv[2])

cat("x =", x, "\n")
cat("y =", y, "\n")
cat("x + y = ", x + y, "\n")
cat("x - y = ", x - y, "\n")
cat("x * y = ", x * y, "\n")
cat("x / y = ", x / y, "\n")
Rscript arith.R 2 3.1415
x = 2
y = 3.1415
x + y = 5.1415
x - y = -1.1415
x * y = 6.283
x / y = 0.6366385
#!/usr/bin/Rscript --slave

argv <- commandArgs(TRUE)
x <- as.numeric(argv[1])
.
. (etc.)
.
chmod +x arith.R
arith.R 2 3.1415
Sys.getenv("R_HOME")
#> [1] "/Library/Frameworks/R.framework/Resources"
> Sys.getenv("R_HOME")
[1] "/Library/Frameworks/R.framework/Resources"
> Sys.getenv("R_HOME")
[1] "/usr/lib/R"
R RHOME
# /usr/lib/R
Sys.setenv(DB_USERID = "my_id")
Sys.setenv(DB_PASSWORD = "My_Password!")
options(prompt = "R> ")
source("~/.Rprofile")
#
# ... remainder of local .Rprofile...
#
help(options)
scores <- c(61, 66, 90, 88, 100)
points <- data.frame(
  label = c("Low", "Mid", "High"),
  lbound = c(0, 0.67,   1.64),
  ubound = c(0.67, 1.64,   2.33)
)
pi
#> [1] 3.14
100 * pi
#> [1] 314
print(pi, digits = 4)
#> [1] 3.142
print(100 * pi, digits = 4)
#> [1] 314.2
cat(pi, "\n")
#> 3.14
cat(format(pi, digits = 4), "\n")
#> 3.142
pnorm(-3:3)
#> [1] 0.00135 0.02275 0.15866 0.50000 0.84134 0.97725 0.99865
print(pnorm(-3:3), digits = 3)
#> [1] 0.00135 0.02275 0.15866 0.50000 0.84134 0.97725 0.99865
q <- seq(from = 0, to = 3, by = 0.5)
tbl <- data.frame(Quant = q,
                  Lower = pnorm(-q),
                  Upper = pnorm(q))
tbl                                # Unformatted print
#>   Quant   Lower Upper
#> 1   0.0 0.50000 0.500
#> 2   0.5 0.30854 0.691
#> 3   1.0 0.15866 0.841
#> 4   1.5 0.06681 0.933
#> 5   2.0 0.02275 0.977
#> 6   2.5 0.00621 0.994
#> 7   3.0 0.00135 0.999
print(tbl, digits = 2)             # Formatted print: fewer digits
#>   Quant  Lower Upper
#> 1   0.0 0.5000  0.50
#> 2   0.5 0.3085  0.69
#> 3   1.0 0.1587  0.84
#> 4   1.5 0.0668  0.93
#> 5   2.0 0.0228  0.98
#> 6   2.5 0.0062  0.99
#> 7   3.0 0.0013  1.00
pi
#> [1] 3.14
options(digits = 15)
pi
#> [1] 3.14159265358979
cat("The answer is", answer, "\n", file = "filename.txt")
sink("filename")          # Begin writing output to file

# ... other session work ...

sink()                    # Resume writing output to console
sink("script_output.txt")   # Redirect output to file
source("script.R")          # Run the script, capturing its output
sink()                      # Resume writing output to console
cat(data, file = "analysisReport.out")
cat(results, file = "analysisRepart.out", append = TRUE)
cat(conclusion, file = "analysisReport.out", append = TRUE)
con <- file("analysisReport.out", "w")
cat(data, file = con)
cat(results, file = con)
cat(conclusion, file = con)
close(con)
list.files()
#>  [1] "_book"                            "_bookdown_files"
#>  [3] "_bookdown_files.old"              "_bookdown.yml"
#>  [5] "_common.R"                        "_main.rds"
#>  [7] "_output.yaml"                     "01_GettingStarted_cache"
#>  [9] "01_GettingStarted.md"             "01_GettingStarted.Rmd"
etc ...
list.files(path = 'data/') # show files in a directory
#>  [1] "ac.rdata"               "adf.rdata"
#>  [3] "anova.rdata"            "anova2.rdata"
#>  [5] "bad.rdata"              "batches.rdata"
#>  [7] "bnd_cmty.Rdata"         "compositePerf-2010.csv"
#>  [9] "conf.rdata"             "daily.prod.rdata"
#> [11] "data1.csv"              "data2.csv"
#> [13] "datafile_missing.tsv"   "datafile.csv"
#> [15] "datafile.fwf"           "datafile.qsv"
#> [17] "datafile.ssv"           "datafile.tsv"
#> [19] "df_decay.rdata"         "df_squared.rdata"
#> [21] "diffs.rdata"            "example1_headless.csv"
#> [23] "example1.csv"           "excel_table_data.xlsx"
#> [25] "get_USDA_NASS_data.R"   "ibm.rdata"
#> [27] "iris_excel.xlsx"        "lab_df.rdata"
#> [29] "movies.sas7bdat"        "nacho_data.csv"
#> [31] "NearestPoint.R"         "not_a_csv.txt"
#> [33] "opt.rdata"              "outcome.rdata"
#> [35] "pca.rdata"              "pred.rdata"
#> [37] "pred2.rdata"            "sat.rdata"
#> [39] "singles.txt"            "state_corn_yield.rds"
#> [41] "student_data.rdata"     "suburbs.txt"
#> [43] "tab1.csv"               "tls.rdata"
#> [45] "triples.txt"            "ts_acf.rdata"
#> [47] "workers.rdata"          "world_series.csv"
#> [49] "xy.rdata"               "yield.Rdata"
#> [51] "z.RData"
list.files(path = 'data/', pattern = '\\.csv')
#> [1] "compositePerf-2010.csv" "data1.csv"
#> [3] "data2.csv"              "datafile.csv"
#> [5] "example1_headless.csv"  "example1.csv"
#> [7] "nacho_data.csv"         "tab1.csv"
#> [9] "world_series.csv"
list.files(recursive = T)
list.files(path = 'data/', all.files = TRUE)
#>  [1] "."                      ".."
#>  [3] ".DS_Store"              ".hidden_file.txt"
#>  [5] "ac.rdata"               "adf.rdata"
#>  [7] "anova.rdata"            "anova2.rdata"
#>  [9] "bad.rdata"              "batches.rdata"
#> [11] "bnd_cmty.Rdata"         "compositePerf-2010.csv"
#> [13] "conf.rdata"             "daily.prod.rdata"
#> [15] "data1.csv"              "data2.csv"
#> [17] "datafile_missing.tsv"   "datafile.csv"
#> [19] "datafile.fwf"           "datafile.qsv"
#> [21] "datafile.ssv"           "datafile.tsv"
#> [23] "df_decay.rdata"         "df_squared.rdata"
#> [25] "diffs.rdata"            "example1_headless.csv"
#> [27] "example1.csv"           "excel_table_data.xlsx"
#> [29] "get_USDA_NASS_data.R"   "ibm.rdata"
#> [31] "iris_excel.xlsx"        "lab_df.rdata"
#> [33] "movies.sas7bdat"        "nacho_data.csv"
#> [35] "NearestPoint.R"         "not_a_csv.txt"
#> [37] "opt.rdata"              "outcome.rdata"
#> [39] "pca.rdata"              "pred.rdata"
#> [41] "pred2.rdata"            "sat.rdata"
#> [43] "singles.txt"            "state_corn_yield.rds"
#> [45] "student_data.rdata"     "suburbs.txt"
#> [47] "tab1.csv"               "tls.rdata"
#> [49] "triples.txt"            "ts_acf.rdata"
#> [51] "workers.rdata"          "world_series.csv"
#> [53] "xy.rdata"               "yield.Rdata"
#> [55] "z.RData"
samp <- read_csv("C:\Data\sample-data.csv")
#> Error: '\D' is an unrecognized escape in character string starting ""C:\D"
samp <- read_csv("C:/Data/sample-data.csv")
samp <- read_csv("C:\\Data\\sample-data.csv")
library(tidyverse)
records <- read_fwf("./data/datafile.fwf",
                    fwf_cols(
                      last = 10,
                      first = 10,
                      birth = 5,
                      death = 5
                    ))
#> Parsed with column specification:
#> cols(
#>   last = col_character(),
#>   first = col_character(),
#>   birth = col_double(),
#>   death = col_double()
#> )
records
#> # A tibble: 5 x 4
#>   last    first    birth death
#>   <chr>   <chr>    <dbl> <dbl>
#> 1 Fisher  R.A.      1890  1962
#> 2 Pearson Karl      1857  1936
#> 3 Cox     Gertrude  1900  1978
#> 4 Yates   Frank     1902  1994
#> 5 Smith   Kirstine  1878  1939
Fisher    R.A.      1890 1962
Pearson   Karl      1857 1936
Cox       Gertrude  1900 1978
Yates     Frank     1902 1994
Smith     Kirstine  1878 1939
file <- "./data/datafile.fwf"
t1 <- read_fwf(file, fwf_empty(file, col_names = c("last", "first", "birth", "death")))
#> Parsed with column specification:
#> cols(
#>   last = col_character(),
#>   first = col_character(),
#>   birth = col_double(),
#>   death = col_double()
#> )
t2 <- read_fwf(file, fwf_widths(c(10, 10, 5, 4),
                                c("last", "first", "birth", "death")))
#> Parsed with column specification:
#> cols(
#>   last = col_character(),
#>   first = col_character(),
#>   birth = col_double(),
#>   death = col_double()
#> )
t3 <-
  read_fwf("./data/datafile.fwf",
           fwf_cols(
             last = 10,
             first = 10,
             birth = 5,
             death = 5
           ))
#> Parsed with column specification:
#> cols(
#>   last = col_character(),
#>   first = col_character(),
#>   birth = col_double(),
#>   death = col_double()
#> )
t4 <- read_fwf(file, fwf_cols(
  last = c(1, 10),
  first = c(11, 20),
  birth = c(21, 25),
  death = c(26, 30)
))
#> Parsed with column specification:
#> cols(
#>   last = col_character(),
#>   first = col_character(),
#>   birth = col_double(),
#>   death = col_double()
#> )
t5 <- read_fwf(file, fwf_positions(
  c(1, 11, 21, 26),
  c(10, 20, 25, 30),
  c("first", "last", "birth", "death")
))
#> Parsed with column specification:
#> cols(
#>   first = col_character(),
#>   last = col_character(),
#>   birth = col_double(),
#>   death = col_double()
#> )
library(tidyverse)

tab1 <- read_table2("./data/datafile.tsv")
#> Parsed with column specification:
#> cols(
#>   last = col_character(),
#>   first = col_character(),
#>   birth = col_double(),
#>   death = col_double()
#> )
tab1
#> # A tibble: 5 x 4
#>   last    first    birth death
#>   <chr>   <chr>    <dbl> <dbl>
#> 1 Fisher  R.A.      1890  1962
#> 2 Pearson Karl      1857  1936
#> 3 Cox     Gertrude  1900  1978
#> 4 Yates   Frank     1902  1994
#> 5 Smith   Kirstine  1878  1939
last    first   birth   death
Fisher  R.A.    1890    1962
Pearson Karl    1857    1936
Cox Gertrude    1900    1978
Yates   Frank   1902    1994
Smith   Kirstine    1878    1939
t <- read_table2("./data/datafile.ssv")
#> Parsed with column specification:
#> cols(
#>   `#The` = col_character(),
#>   following = col_character(),
#>   is = col_character(),
#>   a = col_character(),
#>   list = col_character(),
#>   of = col_character(),
#>   statisticians = col_character()
#> )
#> Warning: 6 parsing failures.
#> row col  expected    actual                  file
#>   1  -- 7 columns 4 columns './data/datafile.ssv'
#>   2  -- 7 columns 4 columns './data/datafile.ssv'
#>   3  -- 7 columns 4 columns './data/datafile.ssv'
#>   4  -- 7 columns 4 columns './data/datafile.ssv'
#>   5  -- 7 columns 4 columns './data/datafile.ssv'
#> ... ... ......... ......... .....................
#> See problems(...) for more details.
print(t)
#> # A tibble: 6 x 7
#>   `#The`  following is    a     list  of    statisticians
#>   <chr>   <chr>     <chr> <chr> <chr> <chr> <chr>
#> 1 last    first     birth death <NA>  <NA>  <NA>
#> 2 Fisher  R.A.      1890  1962  <NA>  <NA>  <NA>
#> 3 Pearson Karl      1857  1936  <NA>  <NA>  <NA>
#> 4 Cox     Gertrude  1900  1978  <NA>  <NA>  <NA>
#> 5 Yates   Frank     1902  1994  <NA>  <NA>  <NA>
#> 6 Smith   Kirstine  1878  1939  <NA>  <NA>  <NA>
t <-
  read_table2(
    "./data/datafile.tsv",
    col_types = c(
      col_character(),
      col_character(),
      col_integer(),
      col_integer()
    )
  )
last    first   birth   death
Fisher  R.A.    1890    1962
Pearson Karl    1857    1936
Cox Gertrude    1900    1978
Yates   Frank   1902    1994
Smith   Kirstine    1878    1939
Cox David 1924 .
t <- read_table2("./data/datafile_missing.tsv", na = ".")
#> Parsed with column specification:
#> cols(
#>   last = col_character(),
#>   first = col_character(),
#>   birth = col_double(),
#>   death = col_double()
#> )
t
#> # A tibble: 6 x 4
#>   last    first    birth death
#>   <chr>   <chr>    <dbl> <dbl>
#> 1 Fisher  R.A.      1890  1962
#> 2 Pearson Karl      1857  1936
#> 3 Cox     Gertrude  1900  1978
#> 4 Yates   Frank     1902  1994
#> 5 Smith   Kirstine  1878  1939
#> 6 Cox     David     1924    NA
# The following is a list of statisticians
last first birth death
Fisher R.A. 1890 1962
Pearson Karl 1857 1936
Cox Gertrude 1900 1978
Yates Frank 1902 1994
Smith Kirstine 1878 1939
t <- read_table2("./data/datafile.ssv", comment = '#')
#> Parsed with column specification:
#> cols(
#>   last = col_character(),
#>   first = col_character(),
#>   birth = col_double(),
#>   death = col_double()
#> )
t
#> # A tibble: 5 x 4
#>   last    first    birth death
#>   <chr>   <chr>    <dbl> <dbl>
#> 1 Fisher  R.A.      1890  1962
#> 2 Pearson Karl      1857  1936
#> 3 Cox     Gertrude  1900  1978
#> 4 Yates   Frank     1902  1994
#> 5 Smith   Kirstine  1878  1939
library(tidyverse)

tbl <- read_csv("./data/datafile.csv")
#> Parsed with column specification:
#> cols(
#>   last = col_character(),
#>   first = col_character(),
#>   birth = col_double(),
#>   death = col_double()
#> )
tbl <- read_csv("./data/datafile.csv",  col_names = FALSE)
#> Parsed with column specification:
#> cols(
#>   X1 = col_character(),
#>   X2 = col_character(),
#>   X3 = col_character(),
#>   X4 = col_character()
#> )
label,lbound,ubound
low,0,0.674
mid,0.674,1.64
high,1.64,2.33
tbl <- read_csv("./data/example1.csv")
#> Parsed with column specification:
#> cols(
#>   label = col_character(),
#>   lbound = col_double(),
#>   ubound = col_double()
#> )
tbl
#> # A tibble: 3 x 3
#>   label lbound ubound
#>   <chr>  <dbl>  <dbl>
#> 1 low    0      0.674
#> 2 mid    0.674  1.64
#> 3 high   1.64   2.33
tbl <- read_csv("./data/example1.csv", col_names = FALSE)
#> Parsed with column specification:
#> cols(
#>   X1 = col_character(),
#>   X2 = col_character(),
#>   X3 = col_character()
#> )
tbl
#> # A tibble: 4 x 3
#>   X1    X2     X3
#>   <chr> <chr>  <chr>
#> 1 label lbound ubound
#> 2 low   0      0.674
#> 3 mid   0.674  1.64
#> 4 high  1.64   2.33
library(tidyverse)

write_csv(tab1, path = "./data/tab1.csv")
library(tidyverse)

print(tab1)
#> # A tibble: 5 x 4
#>   last    first    birth death
#>   <chr>   <chr>    <dbl> <dbl>
#> 1 Fisher  R.A.      1890  1962
#> 2 Pearson Karl      1857  1936
#> 3 Cox     Gertrude  1900  1978
#> 4 Yates   Frank     1902  1994
#> 5 Smith   Kirstine  1878  1939
write_csv(tab1, "./data/tab1.csv")
last,first,birth,death
Fisher,R.A.,1890,1962
Pearson,Karl,1857,1936
Cox,Gertrude,1900,1978
Yates,Frank,1902,1994
Smith,Kirstine,1878,1939
library(tidyverse)

berkley <- read_csv('http://bit.ly/barkley18', comment = '#')
#> Parsed with column specification:
#> cols(
#>   Name = col_character(),
#>   Location = col_character(),
#>   Time = col_time(format = "")
#> )
tbl <- read_table2("ftp://ftp.example.com/download/data.txt")
library(openxlsx)

df1 <- read.xlsx(xlsxFile = "data/iris_excel.xlsx",
                 sheet = 'iris_data')
head(df1, 3)
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1          5.1         3.5          1.4         0.2  setosa
#> 2          4.9         3.0          1.4         0.2  setosa
#> 3          4.7         3.2          1.3         0.2  setosa
library(openxlsx)
wb <- loadWorkbook("data/excel_table_data.xlsx")
tables <- getTables(wb, 'input_data')
table_range_str <- names(tables[tables == 'example_table'])
table_range_refs <- strsplit(table_range_str, ':')[[1]]

# use a regex to extract out the row numbers
table_range_row_num <- gsub("[^0-9.]", "", table_range_refs)

# extract out the column numbers
table_range_col_num <- convertFromExcelRef(table_range_refs)
df <- read.xlsx(
  xlsxFile = "data/excel_table_data.xlsx",
  sheet = 'input_data',
  cols = table_range_col_num[1]:table_range_col_num[2],
  rows = table_range_row_num[1]:table_range_row_num[2]
)
library(openxlsx)

write.xlsx(x = iris,
           sheetName = 'iris_data',
           file = "data/iris_excel.xlsx")
library(openxlsx)

wb <- loadWorkbook("data/excel_table_data.xlsx")
tables <- getTables(wb, 'input_data')
table_range_str <- names(tables[tables == 'example_table'])
table_range_refs <- strsplit(table_range_str, ':')[[1]]

# use a regex to extract out the starting row number
table_row_num <- gsub("[^0-9.]", "", table_range_refs)[[1]]

# extract out the starting column number
table_col_num <- convertFromExcelRef(table_range_refs)[[1]]
## remove the existing Excel Table
removeTable(wb = wb,
            sheet = 'input_data',
            table = 'example_table')
writeDataTable(
  wb = wb,
  sheet = 'input_data',
  x = iris,
  startCol = table_col_num,
  startRow = table_row_num,
  tableStyle = "TableStyleLight9",
  tableName = 'example_table'
)
writeData(
  wb = wb,
  sheet = 'input_data',
  x = paste('example_table data refreshed on:', Sys.time()),
  startCol = 2,
  startRow = 5
)

## then save the workbook
saveWorkbook(wb = wb,
             file = "data/excel_table_data.xlsx",
             overwrite = T)
library(haven)

sas_movie_data <- read_sas("data/movies.sas7bdat")
sapply(sas_movie_data, attributes)
#> $Movie
#> $Movie$label
#> [1] "Movie"
#>
#>
#> $Type
#> $Type$label
#> [1] "Type"
#>
#>
#> $Rating
#> $Rating$label
#> [1] "Rating"
#>
#>
#> $Year
#> $Year$label
#> [1] "Year"
#>
#>
#> $Domestic__
#> $Domestic__$label
#> [1] "Domestic $"
#>
#> $Domestic__$format.sas
#> [1] "F"
#>
#>
#> $Worldwide__
#> $Worldwide__$label
#> [1] "Worldwide $"
#>
#> $Worldwide__$format.sas
#> [1] "F"
#>
#>
#> $Director
#> $Director$label
#> [1] "Director"
library(rvest)
library(magrittr)

all_tables <-
  read_html("https://en.wikipedia.org/wiki/Aviation_accidents_and_incidents") %>%
  html_table(fill = TRUE, header = TRUE)
out_table <-
  read_html("https://en.wikipedia.org/wiki/Aviation_accidents_and_incidents") %>%
  html_table(fill = TRUE, header = TRUE) %>%
  extract2(2)

head(out_table)
#>   Year Deaths[52] # of incidents[53]
#> 1 2017        399           101 [54]
#> 2 2016        629                102
#> 3 2015        898                123
#> 4 2014      1,328                122
#> 5 2013        459                138
#> 6 2012        800                156
url <- 'http://en.wikipedia.org/wiki/World_population'
tbls <- read_html(url) %>%
  html_table(fill = TRUE, header = TRUE)
length(tbls)
#> [1] 23
library(magrittr)
url <- 'http://en.wikipedia.org/wiki/World_population'
tbl <- read_html(url) %>%
  html_table(fill = TRUE, header = TRUE) %>%
  extract2(2)

head(tbl, 2)
#>   World population (millions, UN estimates)[10]
#> 1                                             #
#> 2                                             1
#>   World population (millions, UN estimates)[10]
#> 1               Top ten most populous countries
#> 2                                        China*
#>   World population (millions, UN estimates)[10]
#> 1                                          2000
#> 2                                         1,270
#>   World population (millions, UN estimates)[10]
#> 1                                          2015
#> 2                                         1,376
#>   World population (millions, UN estimates)[10]
#> 1                                         2030*
#> 2                                         1,416
tbl[, c(2, 3)]
#>                          World population (millions, UN estimates)[10]
#> 1                                      Top ten most populous countries
#> 2                                                               China*
#> 3                                                                India
#> 4                                                        United States
#> 5                                                            Indonesia
#> 6                                                             Pakistan
#> 7                                                               Brazil
#> 8                                                              Nigeria
#> 9                                                           Bangladesh
#> 10                                                              Russia
#> 11                                                              Mexico
#> 12                                                         World total
#> 13 Notes:\nChina = excludes Hong Kong and Macau\n2030 = Medium variant
#>                        World population (millions, UN estimates)[10].1
#> 1                                                                 2000
#> 2                                                                1,270
#> 3                                                                1,053
#> 4                                                                  283
#> 5                                                                  212
#> 6                                                                  136
#> 7                                                                  176
#> 8                                                                  123
#> 9                                                                  131
#> 10                                                                 146
#> 11                                                                 103
#> 12                                                               6,127
#> 13 Notes:\nChina = excludes Hong Kong and Macau\n2030 = Medium variant
lines <- readLines("input.txt")
lines <- readLines("input.txt", n = 10)       # Read 10 lines and stop
2355.09 2246.73 1738.74 1841.01 2027.85
singles <- scan("./data/singles.txt", what = numeric(0))
singles
#> [1] 2355.09 2246.73 1738.74 1841.01 2027.85
15-Oct-87 2439.78 2345.63 16-Oct-87 2396.21 2207.73
19-Oct-87 2164.16 1677.55 20-Oct-87 2067.47 1616.21
21-Oct-87 2081.07 1951.76
triples <-
  scan("./data/triples.txt",
       what = list(character(0), numeric(0), numeric(0)))
triples
#> [[1]]
#> [1] "15-Oct-87" "16-Oct-87" "19-Oct-87" "20-Oct-87" "21-Oct-87"
#>
#> [[2]]
#> [1] 2439.78 2396.21 2164.16 2067.47 2081.07
#>
#> [[3]]
#> [1] 2345.63 2207.73 1677.55 1616.21 1951.76
triples <- scan("./data/triples.txt",
                what = list(
                  date = character(0),
                  high = numeric(0),
                  low = numeric(0)
                ))
triples
#> $date
#> [1] "15-Oct-87" "16-Oct-87" "19-Oct-87" "20-Oct-87" "21-Oct-87"
#>
#> $high
#> [1] 2439.78 2396.21 2164.16 2067.47 2081.07
#>
#> $low
#> [1] 2345.63 2207.73 1677.55 1616.21 1951.76
df_triples <- data.frame(triples)
df_triples
#>        date    high     low
#> 1 15-Oct-87 2439.78 2345.63
#> 2 16-Oct-87 2396.21 2207.73
#> 3 19-Oct-87 2164.16 1677.55
#> 4 20-Oct-87 2067.47 1616.21
#> 5 21-Oct-87 2081.07 1951.76
1903  LWLlwwwW    1927  wwWW      1950  wwWW      1973  WLwllWW
1905  wLwWW       1928  WWww      1951  LWlwwW    1974  wlWWW
1906  wLwLwW      1929  wwLWW     1952  lwLWLww   1975  lwWLWlw
1907  WWww        1930  WWllwW    1953  WWllwW    1976  WWww
1908  wWLww       1931  LWwlwLW   1954  WWww      1977  WLwwlW

.
. (etc.)
.
# Read the wseries dataset:
#     - Skip the first 35 lines
#     - Then read 23 lines of data
#     - The data occurs in pairs: a year and a pattern (char string)
#
world.series <- scan(
  "http://lib.stat.cmu.edu/datasets/wseries",
  skip = 35,
  nlines = 23,
  what = list(year = integer(0),
              pattern = character(0)),
)
world.series$year
#>  [1] 1903 1927 1950 1973 1905 1928 1951 1974 1906 1929 1952 1975 1907 1930
#> [15] 1953 1976 1908 1931 1954 1977 1909 1932 1955 1978 1910 1933 1956 1979
#> [29] 1911 1934 1957 1980 1912 1935 1958 1981 1913 1936 1959 1982 1914 1937
#> [43] 1960 1983 1915 1938 1961 1984 1916 1939 1962 1985 1917 1940 1963 1986
#> [57] 1918 1941 1964 1987 1919 1942 1965 1988 1920 1943 1966 1989 1921 1944
#> [71] 1967 1990 1922 1945 1968 1991 1923 1946 1969 1992 1924 1947 1970 1993
#> [85] 1925 1948 1971 1926 1949 1972
perm <- order(world.series$year)
world.series <- list(year    = world.series$year[perm],
                     pattern = world.series$pattern[perm])
world.series$year
#>  [1] 1903 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917
#> [15] 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931
#> [29] 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945
#> [43] 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959
#> [57] 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973
#> [71] 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987
#> [85] 1988 1989 1990 1991 1992 1993

world.series$pattern
#>  [1] "LWLlwwwW" "wLwWW"    "wLwLwW"   "WWww"     "wWLww"    "WLwlWlw"
#>  [7] "WWwlw"    "lWwWlW"   "wLwWlLW"  "wLwWw"    "wwWW"     "lwWWw"
#> [13] "WWlwW"    "WWllWw"   "wlwWLW"   "WWlwwLLw" "wllWWWW"  "LlWwLwWw"
#> [19] "WWwW"     "LwLwWw"   "LWlwlWW"  "LWllwWW"  "lwWLLww"  "wwWW"
#> [25] "WWww"     "wwLWW"    "WWllwW"   "LWwlwLW"  "WWww"     "WWlww"
#> [31] "wlWLLww"  "LWwwlW"   "lwWWLw"   "WWwlw"    "wwWW"     "WWww"
#> [37] "LWlwlWW"  "WLwww"    "LWwww"    "WLWww"    "LWlwwW"   "LWLwwlw"
#> [43] "LWlwlww"  "WWllwLW"  "lwWWLw"   "WLwww"    "wwWW"     "LWlwwW"
#> [49] "lwLWLww"  "WWllwW"   "WWww"     "llWWWlw"  "llWWWlw"  "lwLWWlw"
#> [55] "llWLWww"  "lwWWLw"   "WLlwwLW"  "WLwww"    "wlWLWlw"  "wwWW"
#> [61] "WLlwwLW"  "llWWWlw"  "wwWW"     "wlWWLlw"  "lwLLWww"  "lwWWW"
#> [67] "wwWLW"    "llWWWlw"  "wwLWLlw"  "WLwllWW"  "wlWWW"    "lwWLWlw"
#> [73] "WWww"     "WLwwlW"   "llWWWw"   "lwLLWww"  "WWllwW"   "llWWWw"
#> [79] "LWwllWW"  "LWwww"    "wlWWW"    "LLwlwWW"  "LLwwlWW"  "WWlllWW"
#> [85] "WWlww"    "WWww"     "WWww"     "WWlllWW"  "lwWWLw"   "WLwwlW"
library(RMySQL)

con <- dbConnect(
    drv = RMySQL::MySQL(),
    dbname = "your_db_name",
    host = "your.host.com",
    username = "userid",
    password = "pwd"
  )
[client]
user = userid
password = password
host = hostname
con <- dbConnect(dbConnect(
  drv = RMySQL::MySQL(),
  dbname = "your_db_name",
  host = "your.host.com"
)
sql <- "SELECT * from SurveyResults WHERE City = 'Chicago'"
rows <- dbGetQuery(con, sql)
if (dbMoreResults(con)) dbNextResult(con)
dbDisconnect(con)
con <- dbConnect(MySQL(), client.flag = CLIENT_MULTI_RESULTS)
sql <- paste(
  "select * from DailyBar where Symbol = 'IBM'",
  "and Day between '2008-12-29' and '2008-12-31'"
)
rows <- dbGetQuery(con, sql)
if (dbMoreResults(con)) {
  dbNextResults(con)
}
dbDisconnect(con)
print(rows)
con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
sleep_db <- copy_to(con, msleep, "sleep")
sleep_table <- tbl(con, "sleep")
little_sleep <- sleep_table %>%
  select(name, genus, order, sleep_total) %>%
  filter(sleep_total < 3)
show_query(little_sleep)
#> <SQL>
#> SELECT *
#> FROM (SELECT `name`, `genus`, `order`, `sleep_total`
#> FROM `sleep`)
#> WHERE (`sleep_total` < 3.0)
local_little_sleep <- collect(little_sleep)
local_little_sleep
#> # A tibble: 3 x 4
#>   name        genus         order          sleep_total
#>   <chr>       <chr>         <chr>                <dbl>
#> 1 Horse       Equus         Perissodactyla         2.9
#> 2 Giraffe     Giraffa       Artiodactyla           1.9
#> 3 Pilot whale Globicephalus Cetacea                2.7
save(tbl, t, file = "myData.RData")
load("myData.RData")
dput(tbl, file = "myData.txt")
dump("tbl", file = "myData.txt")    # Note quotes around variable name
myData <- load("myData.RData")     # Achtung! Might not do what you think
myData
#> [1] "tbl" "t"
str(myData)
#>  chr [1:2] "tbl" "t"
load("./data/z.RData")   # Create and populate the z variable
plot(z)                  # Does not plot as expected: zoo pkg not loaded
library(zoo)                  # Load the zoo package into memory
load("./data/z.RData") # Create and populate the z variable
plot(z)                       # Ahhh. Now plotting works correctly
v <- c(10, 20, 30)
names(v) <- c("Moe", "Larry", "Curly")
print(v)
#>   Moe Larry Curly
#>    10    20    30
v["Larry"]
#> Larry
#>    20
mode(3.1415)                        # Mode of a number
#> [1] "numeric"
mode(c(2.7182, 3.1415))             # Mode of a vector of numbers
#> [1] "numeric"
mode("Moe")                         # Mode of a character string
#> [1] "character"
mode(list("Moe", "Larry", "Curly")) # Mode of a list
#> [1] "list"
d <- as.Date("2010-03-15")
mode(d)
#> [1] "numeric"
length(d)
#> [1] 1
class(d)
#> [1] "Date"
pi
#> [1] 3.14
length(pi)
#> [1] 1
pi[1]
#> [1] 3.14
pi[2]
#> [1] NA
A <- 1:6
dim(A)
#> NULL
print(A)
#> [1] 1 2 3 4 5 6
dim(A) <- c(2, 3)
print(A)
#>      [,1] [,2] [,3]
#> [1,]    1    3    5
#> [2,]    2    4    6
B <- list(1, 2, 3, 4, 5, 6)
dim(B)
#> NULL
dim(B) <- c(2, 3)
print(B)
#>      [,1] [,2] [,3]
#> [1,] 1    3    5
#> [2,] 2    4    6
D <- 1:12
dim(D) <- c(2, 3, 2)
print(D)
#> , , 1
#>
#>      [,1] [,2] [,3]
#> [1,]    1    3    5
#> [2,]    2    4    6
#>
#> , , 2
#>
#>      [,1] [,2] [,3]
#> [1,]    7    9   11
#> [2,]    8   10   12
C <- list(1, 2, 3, "X", "Y", "Z")
dim(C) <- c(2, 3)
print(C)
#>      [,1] [,2] [,3]
#> [1,] 1    3    "Y"
#> [2,] 2    "X"  "Z"
v <- c(1, 2, 3)
newItems <- c(6, 7, 8)
v <- c(v, newItems)
v
#> [1] 1 2 3 6 7 8
v[length(v) + 1] <- 42
v
#> [1]  1  2  3  6  7  8 42
v <- c(1, 2, 3)
v <- c(v, 4) # Append a single value to v
v
#> [1] 1 2 3 4

w <- c(5, 6, 7, 8)
v <- c(v, w) # Append an entire vector to v
v
#> [1] 1 2 3 4 5 6 7 8
v <- c(1, 2, 3) # Create a vector of three elements
v[10] <- 10 # Assign to the 10th element
v # R extends the vector automatically
#>  [1]  1  2  3 NA NA NA NA NA NA 10
v
#>  [1]  1  2  3 NA NA NA NA NA NA 10
newvalues <- c(100, 101)
n <- 2
append(v, newvalues, after = n)
#>  [1]   1   2 100 101   3  NA  NA  NA  NA  NA  NA  10
append(1:10, 99, after = 5)
#>  [1]  1  2  3  4  5 99  6  7  8  9 10
append(1:10, 99, after = 0)
#>  [1] 99  1  2  3  4  5  6  7  8  9 10
   1:6   1:3
  ----- -----
    1     1
    2     2
    3     3
    4
    5
    6
   1:6   1:3   (1:6) + (1:3)
  ----- ----- ---------------
    1     1         2
    2     2         4
    3     3         6
    4               5
    5               7
    6               9
(1:6) + (1:3)
#> [1] 2 4 6 5 7 9
r}
cbind(1:6)

cbind(1:3)
cbind(1:6, 1:3)
#>      [,1] [,2]
#> [1,]    1    1
#> [2,]    2    2
#> [3,]    3    3
#> [4,]    4    1
#> [5,]    5    2
#> [6,]    6    3
(1:6) + (1:5) # Oops! 1:5 is one element too short
#> Warning in (1:6) + (1:5): longer object length is not a multiple of shorter
#> object length
#> [1]  2  4  6  8 10  7
(1:6) + 10
#> [1] 11 12 13 14 15 16
v <- c("dog", "cat", "mouse", "rat", "dog")
f <- factor(v) # v can be a vector of strings or integers
f
#> [1] dog   cat   mouse rat   dog
#> Levels: cat dog mouse rat
str(f)
#>  Factor w/ 4 levels "cat","dog","mouse",..: 2 1 3 4 2
v <- c("dog", "cat", "mouse", "rat", "dog")
f <- factor(v, levels = c("dog", "cat", "mouse", "rat", "horse"))
f
#> [1] dog   cat   mouse rat   dog
#> Levels: dog cat mouse rat horse
str(f)
#>  Factor w/ 5 levels "dog","cat","mouse",..: 1 2 3 4 1
f <- factor(c("Win", "Win", "Lose", "Tie", "Win", "Lose"))
f
#> [1] Win  Win  Lose Tie  Win  Lose
#> Levels: Lose Tie Win
wday <- c("Wed", "Thu", "Mon", "Wed", "Thu", "Thu", "Thu", "Tue", "Thu", "Tue")
f <- factor(wday)
f
#>  [1] Wed Thu Mon Wed Thu Thu Thu Tue Thu Tue
#> Levels: Mon Thu Tue Wed
f <- factor(wday, c("Mon", "Tue", "Wed", "Thu", "Fri"))
f
#>  [1] Wed Thu Mon Wed Thu Thu Thu Tue Thu Tue
#> Levels: Mon Tue Wed Thu Fri
v1 <- c(1, 2, 3)
v2 <- c(4, 5, 6)
v3 <- c(7, 8, 9)
comb <- stack(list(v1 = v1, v2 = v2, v3 = v3)) # Combine 3 vectors
comb
#>   values ind
#> 1      1  v1
#> 2      2  v1
#> 3      3  v1
#> 4      4  v2
#> 5      5  v2
#> 6      6  v2
#> 7      7  v3
#> 8      8  v3
#> 9      9  v3
set.seed(2)
n <- 5
freshmen <- sample(1:5, n, replace = TRUE, prob = c(.6, .2, .1, .05, .05))
sophomores <- sample(1:5, n, replace = TRUE, prob = c(.05, .2, .6, .1, .05))
juniors <- sample(1:5, n, replace = TRUE, prob = c(.05, .2, .55, .15, .05))

comb <- stack(list(fresh = freshmen, soph = sophomores, jrs = juniors))
print(comb)
#>    values   ind
#> 1       1 fresh
#> 2       2 fresh
#> 3       1 fresh
#> 4       1 fresh
#> 5       5 fresh
#> 6       5  soph
#> 7       3  soph
#> 8       4  soph
#> 9       3  soph
#> 10      3  soph
#> 11      2   jrs
#> 12      3   jrs
#> 13      4   jrs
#> 14      3   jrs
#> 15      3   jrs
aov(values ~ ind, data = comb)
#> Call:
#>    aov(formula = values ~ ind, data = comb)
#>
#> Terms:
#>                   ind Residuals
#> Sum of Squares   6.53     17.20
#> Deg. of Freedom     2        12
#>
#> Residual standard error: 1.2
#> Estimated effects may be unbalanced
x <- c("a", "b", "c")
y <- c(1, 2, 3)
z <- "why be normal?"
lst <- list(x, y, z)
lst
#> [[1]]
#> [1] "a" "b" "c"
#>
#> [[2]]
#> [1] 1 2 3
#>
#> [[3]]
#> [1] "why be normal?"
lst <- list(0.5, 0.841, 0.977)
lst
#> [[1]]
#> [1] 0.5
#>
#> [[2]]
#> [1] 0.841
#>
#> [[3]]
#> [1] 0.977
lst <- list(3.14, "Moe", c(1, 1, 2, 3), mean)
lst
#> [[1]]
#> [1] 3.14
#>
#> [[2]]
#> [1] "Moe"
#>
#> [[3]]
#> [1] 1 1 2 3
#>
#> [[4]]
#> function (x, ...)
#> UseMethod("mean")
#> <bytecode: 0x7f8f0457ff88>
#> <environment: namespace:base>
lst <- list()
lst[[1]] <- 3.14
lst[[2]] <- "Moe"
lst[[3]] <- c(1, 1, 2, 3)
lst[[4]] <- mean
lst
#> [[1]]
#> [1] 3.14
#>
#> [[2]]
#> [1] "Moe"
#>
#> [[3]]
#> [1] 1 1 2 3
#>
#> [[4]]
#> function (x, ...)
#> UseMethod("mean")
#> <bytecode: 0x7f8f0457ff88>
#> <environment: namespace:base>
lst <- list(mid = 0.5, right = 0.841, far.right = 0.977)
lst
#> $mid
#> [1] 0.5
#>
#> $right
#> [1] 0.841
#>
#> $far.right
#> [1] 0.977
years <- list(1960, 1964, 1976, 1994)
years
#> [[1]]
#> [1] 1960
#>
#> [[2]]
#> [1] 1964
#>
#> [[3]]
#> [1] 1976
#>
#> [[4]]
#> [1] 1994
years[[1]]
years[c(1, 2)]
#> [[1]]
#> [1] 1960
#>
#> [[2]]
#> [1] 1964
class(years[[1]])
#> [1] "numeric"

class(years[1])
#> [1] "list"
cat(years[[1]], "\n")
#> 1960

cat(years[1], "\n")
#> Error in cat(years[1], "\n"): argument 1 (type 'list') cannot be handled by 'cat'
years <- list(Kennedy = 1960, Johnson = 1964, Carter = 1976, Clinton = 1994)
years[["Kennedy"]]
#> [1] 1960
years$Kennedy
#> [1] 1960
years[c("Kennedy", "Johnson")]
#> $Kennedy
#> [1] 1960
#>
#> $Johnson
#> [1] 1964

years["Carter"]
#> $Carter
#> [1] 1976
lst <- list(mid = 0.5, right = 0.841, far.right = 0.977)
lst
#> $mid
#> [1] 0.5
#>
#> $right
#> [1] 0.841
#>
#> $far.right
#> [1] 0.977
values <- c(1, 2, 3)
names <- c("a", "b", "c")
lst <- list()
lst[names] <- values
lst
#> $a
#> [1] 1
#>
#> $b
#> [1] 2
#>
#> $c
#> [1] 3
lst <- list(
  far.left = 0.023,
  left = 0.159,
  mid = 0.500,
  right = 0.841,
  far.right = 0.977
)
lst
#> $far.left
#> [1] 0.023
#>
#> $left
#> [1] 0.159
#>
#> $mid
#> [1] 0.5
#>
#> $right
#> [1] 0.841
#>
#> $far.right
#> [1] 0.977
lst <- list()
lst$far.left <- 0.023
lst$left <- 0.159
lst$mid <- 0.500
lst$right <- 0.841
lst$far.right <- 0.977
lst
#> $far.left
#> [1] 0.023
#>
#> $left
#> [1] 0.159
#>
#> $mid
#> [1] 0.5
#>
#> $right
#> [1] 0.841
#>
#> $far.right
#> [1] 0.977
values <- pnorm(-2:2)
names <- c("far.left", "left", "mid", "right", "far.right")
lst <- list()
lst[names] <- values
cat("The left limit is", lst[["left"]], "\n")
#> The left limit is 0.159
cat("The right limit is", lst[["right"]], "\n")
#> The right limit is 0.841

for (nm in names(lst)) cat("The", nm, "limit is", lst[[nm]], "\n")
#> The far.left limit is 0.0228
#> The left limit is 0.159
#> The mid limit is 0.5
#> The right limit is 0.841
#> The far.right limit is 0.977
years <- list(Kennedy = 1960, Johnson = 1964, Carter = 1976, Clinton = 1994)
years
#> $Kennedy
#> [1] 1960
#>
#> $Johnson
#> [1] 1964
#>
#> $Carter
#> [1] 1976
#>
#> $Clinton
#> [1] 1994
years[["Johnson"]] <- NULL # Remove the element labeled "Johnson"
years
#> $Kennedy
#> [1] 1960
#>
#> $Carter
#> [1] 1976
#>
#> $Clinton
#> [1] 1994
years[c("Carter", "Clinton")] <- NULL # Remove two elements
years
#> $Kennedy
#> [1] 1960
iq.scores <- list(rnorm(5, 100, 15))
iq.scores
#> [[1]]
#> [1] 115.8  88.7  78.4  95.7  84.5
mean(iq.scores)
#> Warning in mean.default(iq.scores): argument is not numeric or logical:
#> returning NA
#> [1] NA
mean(unlist(iq.scores))
#> [1] 92.6
cat(iq.scores, "\n")
#> Error in cat(iq.scores, "\n"): argument 1 (type 'list') cannot be handled by 'cat'
cat("IQ Scores:", unlist(iq.scores), "\n")
#> IQ Scores: 116 88.7 78.4 95.7 84.5
lst <- list(1, NULL, 2, 3, NULL, 4)
lst
#> [[1]]
#> [1] 1
#>
#> [[2]]
#> NULL
#>
#> [[3]]
#> [1] 2
#>
#> [[4]]
#> [1] 3
#>
#> [[5]]
#> NULL
#>
#> [[6]]
#> [1] 4
lst[sapply(lst, is.null)] <- NULL
lst
#> [[1]]
#> [1] 1
#>
#> [[2]]
#> [1] 2
#>
#> [[3]]
#> [1] 3
#>
#> [[4]]
#> [1] 4
lst <- list("Moe", NULL, "Curly") # Create list with NULL element
lst
#> [[1]]
#> [1] "Moe"
#>
#> [[2]]
#> NULL
#>
#> [[3]]
#> [1] "Curly"

lst[sapply(lst, is.null)] <- NULL # Remove NULL element from list
lst
#> [[1]]
#> [1] "Moe"
#>
#> [[2]]
#> [1] "Curly"
lst <- as.list(rnorm(7))
lst
#> [[1]]
#> [1] -0.0281
#>
#> [[2]]
#> [1] -0.366
#>
#> [[3]]
#> [1] -1.12
#>
#> [[4]]
#> [1] -0.976
#>
#> [[5]]
#> [1] 1.12
#>
#> [[6]]
#> [1] 0.324
#>
#> [[7]]
#> [1] -0.568

lst[lst < 0] <- NULL
lst
#> [[1]]
#> [1] 1.12
#>
#> [[2]]
#> [1] 0.324
list(rnorm(7))
#> [[1]]
#> [1] -1.034 -0.533 -0.981  0.823 -0.388  0.879 -2.178
lst[lst == 0] <- NULL
lst[is.na(lst)] <- NULL
lst[abs(lst) < 1] <- NULL
#> Error in abs(lst): non-numeric argument to mathematical function
lst
#> [[1]]
#> [1] 1.12
#>
#> [[2]]
#> [1] 0.324
lst[abs(unlist(lst)) < 1] <- NULL
lst
#> [[1]]
#> [1] 1.12
lst <- as.list(rnorm(5))
lst
#> [[1]]
#> [1] 1.47
#>
#> [[2]]
#> [1] 0.885
#>
#> [[3]]
#> [1] 2.29
#>
#> [[4]]
#> [1] 0.554
#>
#> [[5]]
#> [1] 1.21
lst[lapply(lst, abs) < 1] <- NULL
lst
#> [[1]]
#> [1] 1.47
#>
#> [[2]]
#> [1] 2.29
#>
#> [[3]]
#> [1] 1.21
x <- 1:10
y1 <- 2 * x + rnorm(10, 0, 1)
y2 <- 3 * x + rnorm(10, 0, 8)

result_list <- list(lm(x ~ y1), lm(x ~ y2))

result_list[sapply(result_list, function(m) summary(m)$r.squared < 0.7)] <- NULL
sapply(result_list, function(m) summary(m)$r.squared)
#> [1] 0.990 0.708
lapply(result_list, function(m) summary(m)$r.squared)
#> [[1]]
#> [1] 0.99
#>
#> [[2]]
#> [1] 0.708
vec <- 1:6
matrix(vec, 2, 3)
#>      [,1] [,2] [,3]
#> [1,]    1    3    5
#> [2,]    2    4    6
matrix(0, 2, 3) # Create an all-zeros matrix
#>      [,1] [,2] [,3]
#> [1,]    0    0    0
#> [2,]    0    0    0

matrix(NA, 2, 3) # Create a matrix populated with NA
#>      [,1] [,2] [,3]
#> [1,]   NA   NA   NA
#> [2,]   NA   NA   NA
mat <- matrix(c(1.1, 1.2, 1.3, 2.1, 2.2, 2.3), 2, 3)
mat
#>      [,1] [,2] [,3]
#> [1,]  1.1  1.3  2.2
#> [2,]  1.2  2.1  2.3
theData <- c(
  1.1, 1.2, 1.3,
  2.1, 2.2, 2.3
)
mat <- matrix(theData, 2, 3, byrow = TRUE)
mat
#>      [,1] [,2] [,3]
#> [1,]  1.1  1.2  1.3
#> [2,]  2.1  2.2  2.3
mat <- matrix(c(
  1.1, 1.2, 1.3,
  2.1, 2.2, 2.3
),
2, 3,
byrow = TRUE
)
v <- c(1.1, 1.2, 1.3, 2.1, 2.2, 2.3)
dim(v) <- c(2, 3)
v
#>      [,1] [,2] [,3]
#> [1,]  1.1  1.3  2.2
#> [2,]  1.2  2.1  2.3
theData <- c(
  1.1, 1.2, 1.3,
  2.1, 2.2, 2.3,
  3.1, 3.2, 3.3
)
mat <- matrix(theData, 3, 3, byrow = TRUE)

rownames(mat) <- c("rowname1", "rowname2", "rowname3")
colnames(mat) <- c("colname1", "colname2", "colname3")
mat
#>          colname1 colname2 colname3
#> rowname1      1.1      1.2      1.3
#> rowname2      2.1      2.2      2.3
#> rowname3      3.1      3.2      3.3
library("quantmod")
#> Loading required package: xts
#> Loading required package: zoo
#>
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#>
#>     as.Date, as.Date.numeric
#>
#> Attaching package: 'xts'
#> The following objects are masked from 'package:dplyr':
#>
#>     first, last
#> Loading required package: TTR
#> Version 0.4-0 included new data defaults. See ?getSymbols.

getSymbols(c("AAPL", "MSFT", "GOOG"), auto.assign = TRUE)
#> 'getSymbols' currently uses auto.assign=TRUE by default, but will
#> use auto.assign=FALSE in 0.5-0. You will still be able to use
#> 'loadSymbols' to automatically load data. getOption("getSymbols.env")
#> and getOption("getSymbols.auto.assign") will still be checked for
#> alternate defaults.
#>
#> This message is shown once per session and may be disabled by setting
#> options("getSymbols.warning4.0"=FALSE). See ?getSymbols for details.
#>
#> WARNING: There have been significant changes to Yahoo Finance data.
#> Please see the Warning section of '?getSymbols.yahoo' for details.
#>
#> This message is shown once per session and may be disabled by setting
#> options("getSymbols.yahoo.warning"=FALSE).
#> [1] "AAPL" "MSFT" "GOOG"
cor_mat <- cor(cbind(
  periodReturn(AAPL, period = "daily", subset = "2017"),
  periodReturn(MSFT, period = "daily", subset = "2017"),
  periodReturn(GOOG, period = "daily", subset = "2017")
))
cor_mat
#>                 daily.returns daily.returns.1 daily.returns.2
#> daily.returns           1.000           0.438           0.489
#> daily.returns.1         0.438           1.000           0.619
#> daily.returns.2         0.489           0.619           1.000
colnames(cor_mat) <- c("AAPL", "MSFT", "GOOG")
rownames(cor_mat) <- c("AAPL", "MSFT", "GOOG")
cor_mat
#>       AAPL  MSFT  GOOG
#> AAPL 1.000 0.438 0.489
#> MSFT 0.438 1.000 0.619
#> GOOG 0.489 0.619 1.000
cor_mat["MSFT", "GOOG"] # What is the correlation between MSFT and GOOG?
#> [1] 0.619
mat[1, ] # First row
#> colname1 colname2 colname3
#>      1.1      1.2      1.3
mat[, 3] # Third column
#> rowname1 rowname2 rowname3
#>      1.3      2.3      3.3
mat[1, , drop = FALSE] # First row in a one-row matrix
#>          colname1 colname2 colname3
#> rowname1      1.1      1.2      1.3
mat[, 3, drop = FALSE] # Third column in a one-column matrix
#>          colname3
#> rowname1      1.3
#> rowname2      2.3
#> rowname3      3.3
mat[1, ]
#> colname1 colname2 colname3
#>      1.1      1.2      1.3

mat[, 3]
#> rowname1 rowname2 rowname3
#>      1.3      2.3      3.3
mat[1, , drop = FALSE]
#>          colname1 colname2 colname3
#> rowname1      1.1      1.2      1.3
mat[, 3, drop = FALSE]
#>          colname3
#> rowname1      1.3
#> rowname2      2.3
#> rowname3      3.3
v1 <- 1:5
v2 <- 6:10
v3 <- c("A", "B", "C", "D", "E")
f1 <- factor(c("a", "a", "a", "b", "b"))
df <- data.frame(v1, v2, v3, f1)
df
#>   v1 v2 v3 f1
#> 1  1  6  A  a
#> 2  2  7  B  a
#> 3  3  8  C  a
#> 4  4  9  D  b
#> 5  5 10  E  b
list.of.vectors <- list(v1 = v1, v2 = v2, v3 = v3, f1 = f1)
df2 <- as.data.frame(list.of.vectors)
df2
#>   v1 v2 v3 f1
#> 1  1  6  A  a
#> 2  2  7  B  a
#> 3  3  8  C  a
#> 4  4  9  D  b
#> 5  5 10  E  b
pred1 <- rnorm(10)
pred2 <- rnorm(10, 1, 2)
pred3 <- sample(c("AM", "PM"), 10, replace = TRUE)
resp <- 2.1 + pred1 * .3 + pred2 * .9
df <- data.frame(pred1, pred2, pred3, resp)
df
#>     pred1   pred2 pred3 resp
#> 1  -0.117 -0.0196    AM 2.05
#> 2  -1.133  0.1529    AM 1.90
#> 3   0.632  3.8004    AM 5.71
#> 4   0.188  4.5922    AM 6.29
#> 5   0.892  1.8556    AM 4.04
#> 6  -1.224  2.8140    PM 4.27
#> 7   0.174  0.4908    AM 2.59
#> 8  -0.689 -0.1335    PM 1.77
#> 9   1.204 -0.0482    AM 2.42
#> 10  0.697  2.2268    PM 4.31
df <- data.frame(p1 = pred1, p2 = pred2, p3 = pred3, r = resp)
head(df, 3)
#>       p1      p2 p3    r
#> 1 -0.117 -0.0196 AM 2.05
#> 2 -1.133  0.1529 AM 1.90
#> 3  0.632  3.8004 AM 5.71
tib <- as_tibble(list(p1 = pred1, p2 = pred2, p3 = pred3, r = resp))
tib
#> # A tibble: 10 x 4
#>       p1      p2 p3        r
#>    <dbl>   <dbl> <chr> <dbl>
#> 1 -0.117 -0.0196 AM     2.05
#> 2 -1.13   0.153  AM     1.90
#> 3  0.632  3.80   AM     5.71
#> 4  0.188  4.59   AM     6.29
#> 5  0.892  1.86   AM     4.04
#> 6 -1.22   2.81   PM     4.27
#> # ... with 4 more rows
r1 <- data.frame(a = 1, b = 2, c = "a")
r2 <- data.frame(a = 3, b = 4, c = "b")
r3 <- data.frame(a = 5, b = 6, c = "c")
obs <- list(r1, r2, r3)
df <- do.call(rbind, obs)
df
#>   a b c
#> 1 1 2 a
#> 2 3 4 b
#> 3 5 6 c
rbind(obs[[1]], obs[[2]])
#>   a b c
#> 1 1 2 a
#> 2 3 4 b
do.call(rbind, obs)
#>   a b c
#> 1 1 2 a
#> 2 3 4 b
#> 3 5 6 c
l1 <- list(a = 1, b = 2, c = "a")
l2 <- list(a = 3, b = 4, c = "b")
l3 <- list(a = 5, b = 6, c = "c")
obs <- list(l1, l2, l3)
df <- do.call(rbind, Map(as.data.frame, obs))
df
#>   a b c
#> 1 1 2 a
#> 2 3 4 b
#> 3 5 6 c
r1 <- 1:3
r2 <- 6:8
r3 <- rnorm(3)
obs <- list(r1, r2, r3)
df <- do.call(rbind, obs)
df
#>        [,1]   [,2] [,3]
#> [1,]  1.000  2.000  3.0
#> [2,]  6.000  7.000  8.0
#> [3,] -0.945 -0.547  1.6
data.frame(a = 1, b = 2, c = "a", stringsAsFactors = FALSE)
#>   a b c
#> 1 1 2 a
## same set up as in the previous examples
l1 <- list( a=1, b=2, c='a' )
l2 <- list( a=3, b=4, c='b' )
l3 <- list( a=5, b=6, c='c' )
obs <- list(l1, l2, l3)
df <- do.call(rbind,Map(as.data.frame,obs))
# yes, you could use stringsAsFactors=FALSE above, but we're assuming the data.frame
# came to you with factors already

i <- sapply(df, is.factor)             ## determine which columns are factors
df[i] <- lapply(df[i], as.character)   ## turn only the factors to characters
df
newRow <- data.frame(city = "West Dundee", county = "Kane", state = "IL", pop = 5428)
library(tidyverse)
suburbs <- read_csv("./data/suburbs.txt")
#> Parsed with column specification:
#> cols(
#>   city = col_character(),
#>   county = col_character(),
#>   state = col_character(),
#>   pop = col_double()
#> )

suburbs2 <- rbind(suburbs, newRow)
suburbs2
#> # A tibble: 18 x 4
#>   city    county   state     pop
#>   <chr>   <chr>    <chr>   <dbl>
#> 1 Chicago Cook     IL    2853114
#> 2 Kenosha Kenosha  WI      90352
#> 3 Aurora  Kane     IL     171782
#> 4 Elgin   Kane     IL      94487
#> 5 Gary    Lake(IN) IN     102746
#> 6 Joliet  Kendall  IL     106221
#> # ... with 12 more rows
suburbs3 <- rbind(suburbs, data.frame(city = "West Dundee", county = "Kane", state = "IL", pop = 5428))
suburbs4 <- rbind(
  suburbs,
  data.frame(city = "West Dundee", county = "Kane", state = "IL", pop = 5428),
  data.frame(city = "East Dundee", county = "Kane", state = "IL", pop = 2955)
)
str(suburbs)
#> Classes 'tbl_df', 'tbl' and 'data.frame':    17 obs. of  4 variables:
#>  $ city  : chr  "Chicago" "Kenosha" "Aurora" "Elgin" ...
#>  $ county: chr  "Cook" "Kenosha" "Kane" "Kane" ...
#>  $ state : chr  "IL" "WI" "IL" "IL" ...
#>  $ pop   : num  2853114 90352 171782 94487 102746 ...
#>  - attr(*, "spec")=
#>   .. cols(
#>   ..   city = col_character(),
#>   ..   county = col_character(),
#>   ..   state = col_character(),
#>   ..   pop = col_double()
#>   .. )
str(newRow)
#> 'data.frame':    1 obs. of  4 variables:
#>  $ city  : Factor w/ 1 level "West Dundee": 1
#>  $ county: Factor w/ 1 level "Kane": 1
#>  $ state : Factor w/ 1 level "IL": 1
#>  $ pop   : num 5428
rbind(some_tibble, some_data.frame)
rbind(some_data.frame, some_tibble)
n <- 5
df <- data.frame(colname1 = numeric(n), colname2 = character(n))
n <- 1000000
df <- data.frame(
  dosage = numeric(n),
  lab = character(n),
  response = numeric(n),
  stringsAsFactors = FALSE
)
str(df)
#> 'data.frame':    1000000 obs. of  3 variables:
#>  $ dosage  : num  0 0 0 0 0 0 0 0 0 0 ...
#>  $ lab     : chr  "" "" "" "" ...
#>  $ response: num  0 0 0 0 0 0 0 0 0 0 ...
n <- 1000000
df <- data.frame(
  dosage = numeric(n),
  lab = factor(n, levels = c("NJ", "IL", "CA")),
  response = numeric(n)
)
str(df)
#> 'data.frame':    1000000 obs. of  3 variables:
#>  $ dosage  : num  0 0 0 0 0 0 0 0 0 0 ...
#>  $ lab     : Factor w/ 3 levels "NJ","IL","CA": NA NA NA NA NA NA NA NA NA NA ...
#>  $ response: num  0 0 0 0 0 0 0 0 0 0 ...
suburbs <- read_csv("./data/suburbs.txt")
#> Parsed with column specification:
#> cols(
#>   city = col_character(),
#>   county = col_character(),
#>   state = col_character(),
#>   pop = col_double()
#> )
suburbs
#> # A tibble: 17 x 4
#>   city    county   state     pop
#>   <chr>   <chr>    <chr>   <dbl>
#> 1 Chicago Cook     IL    2853114
#> 2 Kenosha Kenosha  WI      90352
#> 3 Aurora  Kane     IL     171782
#> 4 Elgin   Kane     IL      94487
#> 5 Gary    Lake(IN) IN     102746
#> 6 Joliet  Kendall  IL     106221
#> # ... with 11 more rows
suburbs[[1]]
#>  [1] "Chicago"           "Kenosha"           "Aurora"
#>  [4] "Elgin"             "Gary"              "Joliet"
#>  [7] "Naperville"        "Arlington Heights" "Bolingbrook"
#> [10] "Cicero"            "Evanston"          "Hammond"
#> [13] "Palatine"          "Schaumburg"        "Skokie"
#> [16] "Waukegan"          "West Dundee"
suburbs[1]
#> # A tibble: 17 x 1
#>   city
#>   <chr>
#> 1 Chicago
#> 2 Kenosha
#> 3 Aurora
#> 4 Elgin
#> 5 Gary
#> 6 Joliet
#> # ... with 11 more rows
suburbs %>%
  dplyr::select(1)
#> # A tibble: 17 x 1
#>   city
#>   <chr>
#> 1 Chicago
#> 2 Kenosha
#> 3 Aurora
#> 4 Elgin
#> 5 Gary
#> 6 Joliet
#> # ... with 11 more rows
suburbs %>%
  dplyr::select(1, 4)
#> # A tibble: 17 x 2
#>   city        pop
#>   <chr>     <dbl>
#> 1 Chicago 2853114
#> 2 Kenosha   90352
#> 3 Aurora   171782
#> 4 Elgin     94487
#> 5 Gary     102746
#> 6 Joliet   106221
#> # ... with 11 more rows
suburbs[c(1, 3)]
#> # A tibble: 17 x 2
#>   city    state
#>   <chr>   <chr>
#> 1 Chicago IL
#> 2 Kenosha WI
#> 3 Aurora  IL
#> 4 Elgin   IL
#> 5 Gary    IN
#> 6 Joliet  IL
#> # ... with 11 more rows
suburbs[, 1]
#> # A tibble: 17 x 1
#>   city
#>   <chr>
#> 1 Chicago
#> 2 Kenosha
#> 3 Aurora
#> 4 Elgin
#> 5 Gary
#> 6 Joliet
#> # ... with 11 more rows
suburbs[, c(1, 4)]
#> # A tibble: 17 x 2
#>   city        pop
#>   <chr>     <dbl>
#> 1 Chicago 2853114
#> 2 Kenosha   90352
#> 3 Aurora   171782
#> 4 Elgin     94487
#> 5 Gary     102746
#> 6 Joliet   106221
#> # ... with 11 more rows
df[, vec]
df[, vec, drop = FALSE]
subset(df, select = colname)
subset(df, select = c(colname1, ..., colnameN))
subset(suburbs, subset = (pop > 100000))
#> # A tibble: 5 x 4
#>   city       county   state     pop
#>   <chr>      <chr>    <chr>   <dbl>
#> 1 Chicago    Cook     IL    2853114
#> 2 Aurora     Kane     IL     171782
#> 3 Gary       Lake(IN) IN     102746
#> 4 Joliet     Kendall  IL     106221
#> 5 Naperville DuPage   IL     147779
subset(suburbs, select = c(city, state, pop), subset = (pop > 100000))
#> # A tibble: 5 x 3
#>   city       state     pop
#>   <chr>      <chr>   <dbl>
#> 1 Chicago    IL    2853114
#> 2 Aurora     IL     171782
#> 3 Gary       IN     102746
#> 4 Joliet     IL     106221
#> 5 Naperville IL     147779
suburbs %>%
  dplyr::select(city, state, pop) %>%
  filter(pop > 100000)
#> # A tibble: 5 x 3
#>   city       state     pop
#>   <chr>      <chr>   <dbl>
#> 1 Chicago    IL    2853114
#> 2 Aurora     IL     171782
#> 3 Gary       IN     102746
#> 4 Joliet     IL     106221
#> 5 Naperville IL     147779
library(MASS)
#>
#> Attaching package: 'MASS'
#> The following object is masked from 'package:dplyr':
#>
#>     select
my_subset <- subset(Cars93, select = Model, subset = (MPG.city > 30))
head(my_subset)
#>      Model
#> 31 Festiva
#> 39   Metro
#> 42   Civic
#> 73  LeMans
#> 80   Justy
#> 83   Swift
Cars93 %>%
  filter(MPG.city > 30) %>%
  select(Model) %>%
  head()
#> Error in select(., Model): unused argument (Model)
Cars93 %>%
  filter(MPG.city > 30) %>%
  dplyr::select(Model) %>%
  head()
#>     Model
#> 1 Festiva
#> 2   Metro
#> 3   Civic
#> 4  LeMans
#> 5   Justy
#> 6   Swift
my_cars <- subset(Cars93,
  select = c(Model, Min.Price, Max.Price),
  subset = (Cylinders == 4 & Origin == "USA")
)
head(my_cars)
#>       Model Min.Price Max.Price
#> 6   Century      14.2      17.3
#> 12 Cavalier       8.5      18.3
#> 13  Corsica      11.4      11.4
#> 15   Lumina      13.4      18.4
#> 21  LeBaron      14.5      17.1
#> 23     Colt       7.9      10.6
Cars93 %>%
  filter(Cylinders == 4 & Origin == "USA") %>%
  dplyr::select(Model, Min.Price, Max.Price) %>%
  head()
#>      Model Min.Price Max.Price
#> 1  Century      14.2      17.3
#> 2 Cavalier       8.5      18.3
#> 3  Corsica      11.4      11.4
#> 4   Lumina      13.4      18.4
#> 5  LeBaron      14.5      17.1
#> 6     Colt       7.9      10.6
my_cars <- subset(Cars93,
  select = c(Manufacturer, Model),
  subset = c(MPG.highway > median(MPG.highway))
)
head(my_cars)
#>    Manufacturer    Model
#> 1         Acura  Integra
#> 5           BMW     535i
#> 6         Buick  Century
#> 12    Chevrolet Cavalier
#> 13    Chevrolet  Corsica
#> 15    Chevrolet   Lumina
Cars93 %>%
  filter(MPG.highway > median(MPG.highway)) %>%
  dplyr::select(Manufacturer, Model) %>%
  head()
#>   Manufacturer    Model
#> 1        Acura  Integra
#> 2          BMW     535i
#> 3        Buick  Century
#> 4    Chevrolet Cavalier
#> 5    Chevrolet  Corsica
#> 6    Chevrolet   Lumina
detach("package:MASS", unload = TRUE)
df <- data.frame(V1 = 1:3, V2 = 4:6, V3 = 7:9)
df
#>   V1 V2 V3
#> 1  1  4  7
#> 2  2  5  8
#> 3  3  6  9
colnames(df) <- c("tom", "dick", "harry") # a vector of character strings
df
#>   tom dick harry
#> 1   1    4     7
#> 2   2    5     8
#> 3   3    6     9
df <- data.frame(V1 = 1:3, V2 = 4:6, V3 = 7:9)
df %>%
  rename(tom = V1, dick = V2, harry = V3)
#>   tom dick harry
#> 1   1    4     7
#> 2   2    5     8
#> 3   3    6     9
mat <- matrix(rnorm(9), nrow = 3, ncol = 3)
mat
#>       [,1]    [,2]   [,3]
#> [1,] 0.701  0.0976  0.821
#> [2,] 0.388 -1.2755 -1.086
#> [3,] 1.968  1.2544  0.111
as.data.frame(mat)
#>      V1      V2     V3
#> 1 0.701  0.0976  0.821
#> 2 0.388 -1.2755 -1.086
#> 3 1.968  1.2544  0.111
lst <- list(1:3, c("a", "b", "c"), round(rnorm(3), 3))
lst
#> [[1]]
#> [1] 1 2 3
#>
#> [[2]]
#> [1] "a" "b" "c"
#>
#> [[3]]
#> [1] 0.181 0.773 0.983
as.data.frame(lst)
#>   X1.3 c..a....b....c.. c.0.181..0.773..0.983.
#> 1    1                a                  0.181
#> 2    2                b                  0.773
#> 3    3                c                  0.983
df <- as.data.frame(lst)
colnames(df) <- c("patient", "treatment", "value")
df
#>   patient treatment value
#> 1       1         a 0.181
#> 2       2         b 0.773
#> 3       3         c 0.983
df <- as.data.frame(lst)
df %>%
  rename(
    "patient" = !!names(.[1]),
    "treatment" = !!names(.[2]),
    "value" = !!names(.[3])
  )
#>   patient treatment value
#> 1       1         a 0.181
#> 2       2         b 0.773
#> 3       3         c 0.983
names(lst) <- c("patient", "treatment", "value")
as.data.frame(lst)
#>   patient treatment value
#> 1       1         a 0.181
#> 2       2         b 0.773
#> 3       3         c 0.983
df <- data.frame(my_data = c(NA, 1, NA, 2, NA, 3))
df
#>   my_data
#> 1      NA
#> 2       1
#> 3      NA
#> 4       2
#> 5      NA
#> 6       3
clean_df <- na.omit(df)
clean_df
#>   my_data
#> 2       1
#> 4       2
#> 6       3
df <- data.frame(
  x = c(NA, rnorm(4)),
  y = c(rnorm(2), NA, rnorm(2))
)
df
#>        x      y
#> 1     NA -0.836
#> 2  0.670 -0.922
#> 3 -1.421     NA
#> 4 -0.236 -1.123
#> 5 -0.975  0.372
cumsum(df)
#>    x      y
#> 1 NA -0.836
#> 2 NA -1.759
#> 3 NA     NA
#> 4 NA     NA
#> 5 NA     NA
cumsum(na.omit(df))
#>        x      y
#> 2  0.670 -0.922
#> 4  0.434 -2.046
#> 5 -0.541 -1.674
df <- data.frame(good = rnorm(3), meh = rnorm(3), bad = rnorm(3))
df
#>     good     meh    bad
#> 1  1.911 -0.7045 -1.575
#> 2  0.912  0.0608 -2.238
#> 3 -0.819  0.4424 -0.807
subset(df, select = -bad) # All columns except bad
#>     good     meh
#> 1  1.911 -0.7045
#> 2  0.912  0.0608
#> 3 -0.819  0.4424
df %>%
  dplyr::select(-bad)
#>     good     meh
#> 1  1.911 -0.7045
#> 2  0.912  0.0608
#> 3 -0.819  0.4424
id <- 1:10
pre <- rnorm(10)
dosage <- rnorm(10) + .3 * pre
post <- dosage * .5 * pre
patient_data <- data.frame(id = id, pre = pre, dosage = dosage, post = post)

cor(patient_data)
#>             id     pre  dosage    post
#> id      1.0000 -0.6934 -0.5075  0.0672
#> pre    -0.6934  1.0000  0.5830 -0.0919
#> dosage -0.5075  0.5830  1.0000  0.0878
#> post    0.0672 -0.0919  0.0878  1.0000
cor(subset(patient_data, select = -id))
#>            pre dosage    post
#> pre     1.0000 0.5830 -0.0919
#> dosage  0.5830 1.0000  0.0878
#> post   -0.0919 0.0878  1.0000
patient_data %>%
  dplyr::select(-id) %>%
  cor()
#>            pre dosage    post
#> pre     1.0000 0.5830 -0.0919
#> dosage  0.5830 1.0000  0.0878
#> post   -0.0919 0.0878  1.0000
## JDL Note... now that I've written all this I think the right thing to do is only show dplyr examples... one way to do things is better... fix in edit
cor(subset(patient_data, select = c(-id, -dosage)))
patient_data %>%
  dplyr::select(-id, -dosage) %>%
  cor()
#>          pre    post
#> pre   1.0000 -0.0919
#> post -0.0919  1.0000
df1 <- data_frame(a = rnorm(5))
df2 <- data_frame(b = rnorm(5))

all <- cbind(df1, df2)
all
#>         a       b
#> 1 -1.6357  1.3669
#> 2 -0.3662 -0.5432
#> 3  0.4445 -0.0158
#> 4  0.4945 -0.6960
#> 5  0.0934 -0.7334
df1 <- data_frame(x = rep("a", 2), y = rnorm(2))
df1
#> # A tibble: 2 x 2
#>   x         y
#>   <chr> <dbl>
#> 1 a     1.90
#> 2 a     0.440

df2 <- data_frame(x = rep("b", 2), y = rnorm(2))
df2
#> # A tibble: 2 x 2
#>   x         y
#>   <chr> <dbl>
#> 1 b     2.35
#> 2 b     0.188

rbind(df1, df2)
#> # A tibble: 4 x 2
#>   x         y
#>   <chr> <dbl>
#> 1 a     1.90
#> 2 a     0.440
#> 3 b     2.35
#> 4 b     0.188
df1 <- data_frame(x = rep("a", 2), y = rnorm(2))
df1
#> # A tibble: 2 x 2
#>   x          y
#>   <chr>  <dbl>
#> 1 a     -0.366
#> 2 a     -0.478

df2 <- data_frame(y = 1:2, x = c("b", "b"))
df2
#> # A tibble: 2 x 2
#>       y x
#>   <int> <chr>
#> 1     1 b
#> 2     2 b

rbind(df1, df2)
#> # A tibble: 4 x 2
#>   x          y
#>   <chr>  <dbl>
#> 1 a     -0.366
#> 2 a     -0.478
#> 3 b      1
#> 4 b      2
df1 <- data.frame(index = letters[1:5], val1 = rnorm(5))
df2 <- data.frame(index = letters[1:5], val2 = rnorm(5))

m <- merge(df1, df2, by = "index")
m
#>   index      val1   val2
#> 1     a -0.000837  1.178
#> 2     b -0.214967 -1.599
#> 3     c -1.399293  0.487
#> 4     d  0.010251 -1.688
#> 5     e -0.031463 -0.149
df1 %>%
  inner_join(df2)
#> Joining, by = "index"
#>   index      val1   val2
#> 1     a -0.000837  1.178
#> 2     b -0.214967 -1.599
#> 3     c -1.399293  0.487
#> 4     d  0.010251 -1.688
#> 5     e -0.031463 -0.149
born <- data.frame(
  name = c("Moe", "Larry", "Curly", "Harry"),
  year.born = c(1887, 1902, 1903, 1964),
  place.born = c("Bensonhurst", "Philadelphia", "Brooklyn", "Moscow")
)
died <- data.frame(
  name = c("Curly", "Moe", "Larry"),
  year.died = c(1952, 1975, 1975)
)
merge(born, died, by = "name")
#>    name year.born   place.born year.died
#> 1 Curly      1903     Brooklyn      1952
#> 2 Larry      1902 Philadelphia      1975
#> 3   Moe      1887  Bensonhurst      1975
born %>%
  inner_join(died)
#> Joining, by = "name"
#> Warning: Column `name` joining factors with different levels, coercing to
#> character vector
#>    name year.born   place.born year.died
#> 1   Moe      1887  Bensonhurst      1975
#> 2 Larry      1902 Philadelphia      1975
#> 3 Curly      1903     Brooklyn      1952
with(dataframe, expr)
z <- (suburbs$pop - mean(suburbs$pop)) / sd(suburbs$pop)
z
#>  [1]  3.875 -0.237 -0.116 -0.231 -0.219 -0.214 -0.152 -0.259 -0.266 -0.264
#> [11] -0.261 -0.248 -0.272 -0.260 -0.277 -0.236 -0.364
z <- with(suburbs, (pop - mean(pop)) / sd(pop))
z
#>  [1]  3.875 -0.237 -0.116 -0.231 -0.219 -0.214 -0.152 -0.259 -0.266 -0.264
#> [11] -0.261 -0.248 -0.272 -0.260 -0.277 -0.236 -0.364
suburbs %>%
  mutate(z = (pop - mean(pop)) / sd(pop))
#> # A tibble: 17 x 5
#>   city    county   state     pop      z
#>   <chr>   <chr>    <chr>   <dbl>  <dbl>
#> 1 Chicago Cook     IL    2853114  3.88
#> 2 Kenosha Kenosha  WI      90352 -0.237
#> 3 Aurora  Kane     IL     171782 -0.116
#> 4 Elgin   Kane     IL      94487 -0.231
#> 5 Gary    Lake(IN) IN     102746 -0.219
#> 6 Joliet  Kendall  IL     106221 -0.214
#> # ... with 11 more rows
as.numeric(" 3.14 ")
#> [1] 3.14
as.integer(3.14)
#> [1] 3
as.numeric("foo")
#> Warning: NAs introduced by coercion
#> [1] NA
as.character(101)
#> [1] "101"
as.numeric(c("1", "2.718", "7.389", "20.086"))
#> [1]  1.00  2.72  7.39 20.09
as.numeric(c("1", "2.718", "7.389", "20.086", "etc."))
#> Warning: NAs introduced by coercion
#> [1]  1.00  2.72  7.39 20.09    NA
as.character(101:105)
#> [1] "101" "102" "103" "104" "105"
as.numeric(FALSE)
#> [1] 0
as.numeric(TRUE)
#> [1] 1
logvec <- c(TRUE, FALSE, TRUE, TRUE, TRUE, FALSE)
sum(logvec) ## num true
#> [1] 4
length(logvec) - sum(logvec) ## num not true
#> [1] 2
library(tidyverse)

lst %>%
  map(fun)
library(tidyverse)

lst <- list(
  a = c(1,2,3),
  b = c(4,5,6)
)
lst %>%
  map(mean)
#> $a
#> [1] 2
#>
#> $b
#> [1] 5
fun <- function(x) {
  if (x > 1) {
    1
  } else {
    "Less Than 1"
  }
}

fun(5)
#> [1] 1
fun(0.5)
#> [1] "Less Than 1"
lst <- list(.5, 1.5, .9, 2)

map(lst, fun)
#> [[1]]
#> [1] "Less Than 1"
#>
#> [[2]]
#> [1] 1
#>
#> [[3]]
#> [1] "Less Than 1"
#>
#> [[4]]
#> [1] 1
map_chr(lst, fun)
#> [1] "Less Than 1" "1.000000"    "Less Than 1" "1.000000"

## or using pipes
lst %>%
  map_chr(fun)
#> [1] "Less Than 1" "1.000000"    "Less Than 1" "1.000000"
map_dbl(lst, fun)
#> Error: Can't coerce element 1 from a character to a double
fun <- function(a, b, c) {
  # calculate the sum of a sequence from a to b by c
  sum(seq(a, b, c))
}

df <- data.frame(mn = c(1, 2, 3),
                 mx = c(8, 13, 18),
                 rng = c(1, 2, 3))

df %>%
  mutate(output =
           pmap_dbl(list(a = mn, b = mx, c = rng), fun))
#>   mn mx rng output
#> 1  1  8   1     36
#> 2  2 13   2     42
#> 3  3 18   3     63
pmap(list(a = df$mn, b = df$mx, c = df$rng), fun)
#> [[1]]
#> [1] 36
#>
#> [[2]]
#> [1] 42
#>
#> [[3]]
#> [1] 63
results <- apply(mat, 1, fun)    # mat is a matrix, fun is a function
long <- matrix(1:15, 3, 5)
long
#>      [,1] [,2] [,3] [,4] [,5]
#> [1,]    1    4    7   10   13
#> [2,]    2    5    8   11   14
#> [3,]    3    6    9   12   15
apply(long, 1, mean)
#> [1] 7 8 9
rownames(long) <- c("Moe", "Larry", "Curly")
apply(long, 1, mean)
#>   Moe Larry Curly
#>     7     8     9
apply(long, 1, range)
#>      Moe Larry Curly
#> [1,]   1     2     3
#> [2,]  13    14    15
apply(mat, 2, fun)
mat <- matrix(c(1, 3, 2, 5, 4, 6), 2, 3)
colnames(mat) <- c("t1", "t2", "t3")
mat
#>      t1 t2 t3
#> [1,]  1  2  4
#> [2,]  3  5  6

apply(mat, 2, mean)  # Compute the mean of every column
#>  t1  t2  t3
#> 2.0 3.5 5.0
df2 <- map_df(df, fun) # Returns a data.frame
load("./data/batches.rdata")
head(batches)
#>   batch clinic dosage shrinkage
#> 1     3     KY     IL    -0.307
#> 2     3     IL     IL    -1.781
#> 3     1     KY     IL    -0.172
#> 4     3     KY     IL     1.215
#> 5     2     IL     IL     1.895
#> 6     2     NJ     IL    -0.430
map_df(batches, class)
#> # A tibble: 1 x 4
#>   batch  clinic dosage shrinkage
#>   <chr>  <chr>  <chr>  <chr>
#> 1 factor factor factor numeric
lst <- list(v1, v2, v3)
pmap(lst, fun)
map2(v1, v2, fun)
map2_dbl(v1, v2, fun)
gcd <- function(a, b) {
  if (b == 0) {
    return(a)
  } else {
    return(gcd(b, a %% b))
  }
}
gcd(c(1, 2, 3), c(9, 6, 3))
#> Warning in if (b == 0) {: the condition has length > 1 and only the first
#> element will be used

#> Warning in if (b == 0) {: the condition has length > 1 and only the first
#> element will be used

#> Warning in if (b == 0) {: the condition has length > 1 and only the first
#> element will be used
#> [1] 1 2 0
a <- c(1, 2, 3)
b <- c(9, 6, 3)
my_gcds <- map2(a, b, gcd)
my_gcds
#> [[1]]
#> [1] 1
#>
#> [[2]]
#> [1] 2
#>
#> [[3]]
#> [1] 3
unlist(my_gcds)
#> [1] 1 2 3
map2_chr(a, b, gcd)
#> [1] "1.000000" "2.000000" "3.000000"
map2_dbl(a, b, gcd)
#> [1] 1 2 3
lst <- list(a,b)
pmap(lst, gcd)
#> [[1]]
#> [1] 1
#>
#> [[2]]
#> [1] 2
#>
#> [[3]]
#> [1] 3
lst <- list(a,b)
pmap_dbl(lst, gcd)
#> [1] 1 2 3
df %>%
  group_by(v1, v2) %>%
  summarize(
    result_var = fun(value_var)
  )
df <- tibble(
  my_group = c("A", "B","A", "B","A", "B"),
  values = 1:6
)

df %>%
  group_by(my_group) %>%
  summarize(
    avg_values = mean(values),
    tot_values = sum(values),
    count_values = n()
  )
#> # A tibble: 2 x 4
#>   my_group avg_values tot_values count_values
#>   <chr>         <dbl>      <int>        <int>
#> 1 A                 3          9            3
#> 2 B                 4         12            3
nchar("Moe")
#> [1] 3
nchar("Curly")
#> [1] 5
s <- c("Moe", "Larry", "Curly")
nchar(s)
#> [1] 3 5 5
length("Moe")
#> [1] 1
length(c("Moe", "Larry", "Curly"))
#> [1] 3
paste("Everybody", "loves", "stats.")
#> [1] "Everybody loves stats."
paste("Everybody", "loves", "stats.", sep = "-")
#> [1] "Everybody-loves-stats."
paste("Everybody", "loves", "stats.", sep = "")
#> [1] "Everybodylovesstats."
paste0("Everybody", "loves", "stats.")
#> [1] "Everybodylovesstats."
paste("The square root of twice pi is approximately", sqrt(2 * pi))
#> [1] "The square root of twice pi is approximately 2.506628274631"
stooges <- c("Moe", "Larry", "Curly")
paste(stooges, "loves", "stats.")
#> [1] "Moe loves stats."   "Larry loves stats." "Curly loves stats."
paste(stooges, "loves", "stats", collapse = ", and ")
#> [1] "Moe loves stats, and Larry loves stats, and Curly loves stats"
substr("Statistics", 1, 4) # Extract first 4 characters
#> [1] "Stat"
substr("Statistics", 7, 10) # Extract last 4 characters
#> [1] "tics"
ss <- c("Moe", "Larry", "Curly")
substr(ss, 1, 3) # Extract first 3 characters of each string
#> [1] "Moe" "Lar" "Cur"
cities <- c("New York, NY", "Los Angeles, CA", "Peoria, IL")
substr(cities, nchar(cities) - 1, nchar(cities))
#> [1] "NY" "CA" "IL"
strsplit(string, delimiter)
path <- "/home/mike/data/trials.csv"
strsplit(path, "/")
#> [[1]]
#> [1] ""           "home"       "mike"       "data"       "trials.csv"
strsplit(path, "/")[[1]]
#> [1] ""           "home"       "mike"       "data"       "trials.csv"
paths <- c(
  "/home/mike/data/trials.csv",
  "/home/mike/data/errors.csv",
  "/home/mike/corr/reject.doc"
)
strsplit(paths, "/")
#> [[1]]
#> [1] ""           "home"       "mike"       "data"       "trials.csv"
#>
#> [[2]]
#> [1] ""           "home"       "mike"       "data"       "errors.csv"
#>
#> [[3]]
#> [1] ""           "home"       "mike"       "corr"       "reject.doc"
sub(old, new, string)
gsub(old, new, string)
str <- "Curly is the smart one. Curly is funny, too."
sub("Curly", "Moe", str)
#> [1] "Moe is the smart one. Curly is funny, too."
gsub("Curly", "Moe", str)
#> [1] "Moe is the smart one. Moe is funny, too."
sub(" and SAS", "", "For really tough problems, you need R and SAS.")
#> [1] "For really tough problems, you need R."
m <- outer(strings1, strings2, paste, sep = "")
locations <- c("NY", "LA", "CHI", "HOU")
treatments <- c("T1", "T2", "T3")
outer(locations, treatments, paste, sep = "-")
#>      [,1]     [,2]     [,3]
#> [1,] "NY-T1"  "NY-T2"  "NY-T3"
#> [2,] "LA-T1"  "LA-T2"  "LA-T3"
#> [3,] "CHI-T1" "CHI-T2" "CHI-T3"
#> [4,] "HOU-T1" "HOU-T2" "HOU-T3"
outer(treatments, treatments, paste, sep = "-")
#>      [,1]    [,2]    [,3]
#> [1,] "T1-T1" "T1-T2" "T1-T3"
#> [2,] "T2-T1" "T2-T2" "T2-T3"
#> [3,] "T3-T1" "T3-T2" "T3-T3"
expand.grid(treatments, treatments)
#>   Var1 Var2
#> 1   T1   T1
#> 2   T2   T1
#> 3   T3   T1
#> 4   T1   T2
#> 5   T2   T2
#> 6   T3   T2
#> 7   T1   T3
#> 8   T2   T3
#> 9   T3   T3
m <- outer(treatments, treatments, paste, sep = "-")
m[!lower.tri(m)]
#> [1] "T1-T1" "T1-T2" "T2-T2" "T1-T3" "T2-T3" "T3-T3"
Sys.Date()
#> [1] "2019-01-07"
class(Sys.Date())
#> [1] "Date"
as.Date("2018-12-31")
#> [1] "2018-12-31"
as.Date("12/31/2018")
#> Error in charToDate(x): character string is not in a standard unambiguous format
as.Date("12/31/2018", format = "%m/%d/%Y")
#> [1] "2018-12-31"
format(Sys.Date())
#> [1] "2019-01-07"
as.character(Sys.Date())
#> [1] "2019-01-07"
format(Sys.Date(), format = "%m/%d/%Y")
#> [1] "01/07/2019"
ISOdate(year, month, day)
year <- 2018
month <- 12
day <- 31
as.Date(ISOdate(year, month, day))
#> [1] "2018-12-31"
ISOdate(2020, 2, 29)
#> [1] "2020-02-29 12:00:00 GMT"
as.Date(ISOdate(2020, 2, 29))
#> [1] "2020-02-29"
ISOdate(2013, 2, 29) # Oops! 2013 is not a leap year
#> [1] NA
years <- 2010:2014
months <- rep(1, 5)
days <- 5:9
ISOdate(years, months, days)
#> [1] "2010-01-05 12:00:00 GMT" "2011-01-06 12:00:00 GMT"
#> [3] "2012-01-07 12:00:00 GMT" "2013-01-08 12:00:00 GMT"
#> [5] "2014-01-09 12:00:00 GMT"
as.Date(ISOdate(years, months, days))
#> [1] "2010-01-05" "2011-01-06" "2012-01-07" "2013-01-08" "2014-01-09"
as.Date(ISOdate(years, 1, days))
#> [1] "2010-01-05" "2011-01-06" "2012-01-07" "2013-01-08" "2014-01-09"
ISOdatetime(year, month, day, hour, minute, second)
d <- as.Date("2019-03-15")
as.integer(d)
#> [1] 17970
jd <- julian(d)
jd
#> [1] 17970
#> attr(,"origin")
#> [1] "1970-01-01"
attr(jd, "origin")
#> [1] "1970-01-01"
as.integer(as.Date("1970-01-01"))
#> [1] 0
as.integer(as.Date("1970-01-02"))
#> [1] 1
as.integer(as.Date("1970-01-03"))
#> [1] 2
d <- as.Date("2019-03-15")
p <- as.POSIXlt(d)
p$mday        # Day of the month
#> [1] 15
p$mon         # Month (0 = January)
#> [1] 2
p$year + 1900 # Year
#> [1] 2019
d <- as.Date("2020-04-02")
as.POSIXlt(d)$wday
#> [1] 4
as.POSIXlt(d)$yday
#> [1] 92
as.POSIXlt(d)$year # Oops!
#> [1] 120
as.POSIXlt(d)$year + 1900
#> [1] 2020
s <- as.Date("2019-01-01")
e <- as.Date("2019-02-01")
seq(from = s, to = e, by = 1) # One month of dates
#>  [1] "2019-01-01" "2019-01-02" "2019-01-03" "2019-01-04" "2019-01-05"
#>  [6] "2019-01-06" "2019-01-07" "2019-01-08" "2019-01-09" "2019-01-10"
#> [11] "2019-01-11" "2019-01-12" "2019-01-13" "2019-01-14" "2019-01-15"
#> [16] "2019-01-16" "2019-01-17" "2019-01-18" "2019-01-19" "2019-01-20"
#> [21] "2019-01-21" "2019-01-22" "2019-01-23" "2019-01-24" "2019-01-25"
#> [26] "2019-01-26" "2019-01-27" "2019-01-28" "2019-01-29" "2019-01-30"
#> [31] "2019-01-31" "2019-02-01"
seq(from = s, by = 1, length.out = 7) # Dates, one week apart
#> [1] "2019-01-01" "2019-01-02" "2019-01-03" "2019-01-04" "2019-01-05"
#> [6] "2019-01-06" "2019-01-07"
seq(from = s, by = "month", length.out = 12)   # First of the month for one year
#>  [1] "2019-01-01" "2019-02-01" "2019-03-01" "2019-04-01" "2019-05-01"
#>  [6] "2019-06-01" "2019-07-01" "2019-08-01" "2019-09-01" "2019-10-01"
#> [11] "2019-11-01" "2019-12-01"
seq(from = s, by = "3 months", length.out = 4) # Quarterly dates for one year
#> [1] "2019-01-01" "2019-04-01" "2019-07-01" "2019-10-01"
seq(from = s, by = "year", length.out = 10)    # Year-start dates for one decade
#>  [1] "2019-01-01" "2020-01-01" "2021-01-01" "2022-01-01" "2023-01-01"
#>  [6] "2024-01-01" "2025-01-01" "2026-01-01" "2027-01-01" "2028-01-01"
seq(as.Date("2019-01-29"), by = "month", len = 3)
#> [1] "2019-01-29" "2019-03-01" "2019-03-29"
?Normal
?TDist
n <- 10
k <- 2
choose(n, k)
#> [1] 45
choose(5, 3)   # How many ways can we select 3 items from 5 items?
#> [1] 10
choose(50, 3)  # How many ways can we select 3 items from 50 items?
#> [1] 19600
choose(50, 30) # How many ways can we select 30 items from 50 items?
#> [1] 4.71e+13
items <- 2:5
k <- 2
combn(items, k)
#>      [,1] [,2] [,3] [,4] [,5] [,6]
#> [1,]    2    2    2    3    3    4
#> [2,]    3    4    5    4    5    5
combn(1:5, 3)
#>      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
#> [1,]    1    1    1    1    1    1    2    2    2     3
#> [2,]    2    2    2    3    3    4    3    3    4     4
#> [3,]    3    4    5    4    5    5    4    5    5     5
combn(c("T1", "T2", "T3", "T4", "T5"), 3)
#>      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
#> [1,] "T1" "T1" "T1" "T1" "T1" "T1" "T2" "T2" "T2" "T3"
#> [2,] "T2" "T2" "T2" "T3" "T3" "T4" "T3" "T3" "T4" "T4"
#> [3,] "T3" "T4" "T5" "T4" "T5" "T5" "T4" "T5" "T5" "T5"
runif(1)
#> [1] 0.915
rnorm(1)
#> [1] 1.53
runif(1)
#> [1] 0.83
runif(10)
#>  [1] 0.642 0.519 0.737 0.135 0.657 0.705 0.458 0.719 0.935 0.255
set.seed(42)
runif(1, min = -3, max = 3)      # One uniform variate between -3 and +3
#> [1] 2.49
rnorm(1)                         # One standard Normal variate
#> [1] 1.53
rnorm(1, mean = 100, sd = 15)    # One Normal variate, mean 100 and SD 15
#> [1] 114
rbinom(1, size = 10, prob = 0.5) # One binomial variate
#> [1] 5
rpois(1, lambda = 10)            # One Poisson variate
#> [1] 12
rexp(1, rate = 0.1)              # One exponential variate
#> [1] 3.14
rgamma(1, shape = 2, rate = 0.1) # One gamma variate
#> [1] 22.3
rnorm(3, mean = c(-10, 0, +10), sd = 1)
#> [1] -9.420 -0.658 11.555
means <- rnorm(30, mean = 0, sd = 0.2)
rnorm(30, mean = means, sd = 1)
#>  [1] -0.5549 -2.9232 -1.2203  0.6962  0.1673 -1.0779 -0.3138 -3.3165
#>  [9]  1.5952  0.8184 -0.1251  0.3601 -0.8142  0.1050  2.1264  0.6943
#> [17] -2.7771  0.9026  0.0389  0.2280 -0.5599  0.9572  0.1972  0.2602
#> [25] -0.4423  1.9707  0.4553  0.0467  1.5229  0.3176
set.seed(42) # Or use any other positive integer...
set.seed(165)   # Initialize generator to known state
runif(10)       # Generate ten random numbers
#>  [1] 0.116 0.450 0.996 0.611 0.616 0.426 0.666 0.168 0.788 0.442

set.seed(165)   # Reinitialize to the same known state
runif(10)       # Generate the same ten "random" numbers
#>  [1] 0.116 0.450 0.996 0.611 0.616 0.426 0.666 0.168 0.788 0.442
sample(set, n)
world_series <- read_csv("./data/world_series.csv")
sample(world_series$year, 10)
#>  [1] 2010 1961 1906 1992 1982 1948 1910 1973 1967 1931
sample(world_series$year, 10)
#>  [1] 1941 1973 1921 1958 1979 1946 1932 1919 1971 1974
set.seed(42)
x <- rnorm(1000, 4, 10)
medians <- numeric(1000)   # empty vector of 1000 numbers
for (i in 1:1000) {
  medians[i] <- median(sample(x, replace = TRUE))
}
ci <- quantile(medians, c(0.025, 0.975))
cat("95% confidence interval is (", ci, ")\n")
#> 95% confidence interval is ( 3.16 4.49 )
sample(set, n, replace = TRUE)
sample(c("H", "T"), 10, replace = TRUE)
#>  [1] "H" "T" "H" "T" "T" "T" "H" "T" "T" "H"
sample(c(FALSE, TRUE), 20, replace = TRUE)
#>  [1]  TRUE FALSE  TRUE  TRUE FALSE  TRUE FALSE FALSE  TRUE  TRUE FALSE
#> [12]  TRUE  TRUE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE
sample(c(FALSE, TRUE), 20, replace = TRUE, prob = c(0.2, 0.8))
#>  [1]  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
#> [12]  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE
rbinom(10, 1, 0.8)
#>  [1] 1 0 1 1 1 1 1 0 1 1
sample(v, size = length(v), replace = FALSE)
sample(1:10)
#>  [1]  7  3  6  1  5  2  4  8 10  9
dbinom(7, size = 10, prob = 0.5)
#> [1] 0.117
pbinom(7, size = 10, prob = 0.5)
#> [1] 0.945
pbinom(7, size = 10, prob = 0.5, lower.tail = FALSE)
#> [1] 0.0547
pbinom(7, size = 10, prob = 0.5) - pbinom(3, size = 10, prob = 0.5)
#> [1] 0.773
pbinom(c(3, 7), size = 10, prob = 0.5)
#> [1] 0.172 0.945
diff(pbinom(c(3, 7), size = 10, prob = 0.5))
#> [1] 0.773
pnorm(q = .8, mean = 0, sd = 1)
#> [1] 0.788
pnorm(66, mean = 70, sd = 3)
#> [1] 0.0912
pexp(20, rate = 1 / 40)
#> [1] 0.393
pexp(50, rate = 1 / 40, lower.tail = FALSE)
#> [1] 0.287
pexp(50, rate = 1 / 40) - pexp(20, rate = 1 / 40)
#> [1] 0.32
qnorm(0.05, mean = 100, sd = 15)
#> [1] 75.3
qnorm(0.025)
#> [1] -1.96
qnorm(0.975)
#> [1] 1.96
qnorm(c(0.025, 0.975))
#> [1] -1.96  1.96
dens <- data.frame(x = x,
                   y = d_____(x))
ggplot(dens, aes(x, y)) + geom_line()
library(ggplot2)

x <- seq(-3, +3, 0.1)
dens <- data.frame(x = x, y = dnorm(x))

ggplot(dens, aes(x, y)) + geom_line()
x <- seq(from = 0, to = 6, length.out = 100) # Define the density domains
ylim <- c(0, 0.6)

# Make a data.frame with densities of several distributions
df <- rbind(
  data.frame(x = x, dist_name = "Uniform"    , y = dunif(x, min   = 2, max = 4)),
  data.frame(x = x, dist_name = "Normal"     , y = dnorm(x, mean  = 3, sd = 1)),
  data.frame(x = x, dist_name = "Exponential", y = dexp(x, rate  = 1 / 2)),
  data.frame(x = x, dist_name = "Gamma"      , y = dgamma(x, shape = 2, rate = 1)) )

# Make a line plot like before, but use facet_wrap to create the grid
ggplot(data = df, aes(x = x, y = y)) +
  geom_line() +
  facet_wrap(~dist_name)   # facet and wrap by the variable dist_name
x <- seq(from = -3, to = 3, length.out = 100)
df <- data.frame(x = x, y = dnorm(x, mean = 0, sd = 1))

p <- ggplot(df, aes(x, y)) +
  geom_line() +
  labs(
    title = "Standard Normal Distribution",
    y = "Density",
    x = "Quantile"
  )
p
q75 <- quantile(df$x, .75)
q95 <- quantile(df$x, .95)

p +
  geom_ribbon(
    data = subset(df, x > q75 & x < q95),
    aes(ymax = y),
    ymin = 0,
    fill = "blue",
    colour = NA,
    alpha = 0.5
  )
summary(vec)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
#>     0.0     0.5     1.0     1.6     1.9    33.0
summary(mat)
#>      Samp1           Samp2            Samp3
#>  Min.   :  1.0   Min.   :-2.943   Min.   : 0.04
#>  1st Qu.: 25.8   1st Qu.:-0.774   1st Qu.: 0.39
#>  Median : 50.5   Median :-0.052   Median : 0.85
#>  Mean   : 50.5   Mean   :-0.067   Mean   : 1.60
#>  3rd Qu.: 75.2   3rd Qu.: 0.684   3rd Qu.: 2.12
#>  Max.   :100.0   Max.   : 2.150   Max.   :13.18
summary(fac)
#> Maybe    No   Yes
#>    38    32    30
summary(char)
#>    Length     Class      Mode
#>       100 character character
suburbs <- read_csv("./data/suburbs.txt")
summary(suburbs)
#>      city              county             state
#>  Length:17          Length:17          Length:17
#>  Class :character   Class :character   Class :character
#>  Mode  :character   Mode  :character   Mode  :character
#>
#>
#>
#>       pop
#>  Min.   :   5428
#>  1st Qu.:  72616
#>  Median :  83048
#>  Mean   : 249770
#>  3rd Qu.: 102746
#>  Max.   :2853114
summary(vec_list)
#>   Length Class  Mode
#> x 100    -none- numeric
#> y 100    -none- numeric
#> z 100    -none- character
library(purrr)
map(vec_list, summary)
#> $x
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
#>  -2.572  -0.686  -0.084  -0.043   0.660   2.413
#>
#> $y
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
#>  -1.752  -0.589   0.045   0.079   0.769   2.293
#>
#> $z
#>    Length     Class      Mode
#>       100 character character
mean(x > 3)
#> [1] 0.12
table(f1)
#> f1
#>  a  b  c  d  e
#> 14 23 24 21 18
table(f1, f2)
#>    f2
#> f1   f  g  h
#>   a  6  4  4
#>   b  7  9  7
#>   c  4 11  9
#>   d  7  8  6
#>   e  5 10  3
t1 <- sample(letters[9:11], 100, replace = TRUE)
table(t1)
#> t1
#>  i  j  k
#> 20 40 40
set.seed(42)
initial <- factor(sample(c("Yes", "No", "Maybe"), 100, replace = TRUE))
outcome <- factor(sample(c("Pass", "Fail"), 100, replace = TRUE))

table(initial)
#> initial
#> Maybe    No   Yes
#>    39    31    30

table(outcome)
#> outcome
#> Fail Pass
#>   56   44
table(initial, outcome)
#>        outcome
#> initial Fail Pass
#>   Maybe   23   16
#>   No      20   11
#>   Yes     13   17
summary(table(initial, outcome))
#> Number of cases in table: 100
#> Number of factors: 2
#> Test for independence of all factors:
#>  Chisq = 3, df = 2, p-value = 0.2
summary(table(initial, outcome))
#> Number of cases in table: 100
#> Number of factors: 2
#> Test for independence of all factors:
#>  Chisq = 3, df = 2, p-value = 0.2
quantile(vec, 0.95)
#>  95%
#> 1.43
quantile(vec)
#>      0%     25%     50%     75%    100%
#> -2.0247 -0.5915 -0.0693  0.4618  2.7019
vec <- runif(1000)
quantile(vec, .05)
#>     5%
#> 0.0451
quantile(vec, c(.05, .95))
#>     5%    95%
#> 0.0451 0.9363
quantile(vec)
#>       0%      25%      50%      75%     100%
#> 0.000405 0.235529 0.479543 0.737619 0.999379
mean(vec < 1.6)
#> [1] 0.948
scale(x)
#>          [,1]
#>  [1,]  0.8701
#>  [2,] -0.7133
#>  [3,] -1.0503
#>  [4,]  0.5790
#>  [5,] -0.6324
#>  [6,]  0.0991
#>  [7,]  2.1495
#>  [8,]  0.2481
#>  [9,] -0.8155
#> [10,] -0.7341
#> attr(,"scaled:center")
#> [1] 2.42
#> attr(,"scaled:scale")
#> [1] 2.11
(y - mean(x)) / sd(x)
#> [1] -0.633
t.test(x, mu = m)
x <- rnorm(75, mean = 100, sd = 15)
t.test(x, mu = 95)
#>
#>  One Sample t-test
#>
#> data:  x
#> t = 3, df = 70, p-value = 0.005
#> alternative hypothesis: true mean is not equal to 95
#> 95 percent confidence interval:
#>   96.5 103.0
#> sample estimates:
#> mean of x
#>      99.7
t.test(x, mu = 100)
#>
#>  One Sample t-test
#>
#> data:  x
#> t = -0.2, df = 70, p-value = 0.9
#> alternative hypothesis: true mean is not equal to 100
#> 95 percent confidence interval:
#>   96.5 103.0
#> sample estimates:
#> mean of x
#>      99.7
t.test(x)
t.test(x)
#>
#>  One Sample t-test
#>
#> data:  x
#> t = 50, df = 50, p-value <2e-16
#> alternative hypothesis: true mean is not equal to 0
#> 95 percent confidence interval:
#>   94.2 101.5
#> sample estimates:
#> mean of x
#>      97.9
t.test(x, conf.level = 0.99)
#>
#>  One Sample t-test
#>
#> data:  x
#> t = 50, df = 50, p-value <2e-16
#> alternative hypothesis: true mean is not equal to 0
#> 99 percent confidence interval:
#>   92.9 102.8
#> sample estimates:
#> mean of x
#>      97.9
wilcox.test(x, conf.int = TRUE)
wilcox.test(x, conf.int = TRUE)
#>
#>  Wilcoxon signed rank test
#>
#> data:  x
#> V = 200, p-value = 0.1
#> alternative hypothesis: true location is not equal to 0
#> 95 percent confidence interval:
#>  -0.102  0.646
#> sample estimates:
#> (pseudo)median
#>          0.311
median(x)
#> [1] 0.314
prop.test(x, n, p)
prop.test(11, 20, 0.5, alternative = "greater")
#>
#>  1-sample proportions test with continuity correction
#>
#> data:  11 out of 20, null probability 0.5
#> X-squared = 0.05, df = 1, p-value = 0.4
#> alternative hypothesis: true p is greater than 0.5
#> 95 percent confidence interval:
#>  0.35 1.00
#> sample estimates:
#>    p
#> 0.55
prop.test(x, n)
prop.test(6, 9)
#> Warning in prop.test(6, 9): Chi-squared approximation may be incorrect
#>
#>  1-sample proportions test with continuity correction
#>
#> data:  6 out of 9, null probability 0.5
#> X-squared = 0.4, df = 1, p-value = 0.5
#> alternative hypothesis: true p is not equal to 0.5
#> 95 percent confidence interval:
#>  0.309 0.910
#> sample estimates:
#>     p
#> 0.667
prop.test(x, n, p, conf.level = 0.99)   # 99% confidence level
shapiro.test(x)
shapiro.test(x)
#>
#>  Shapiro-Wilk normality test
#>
#> data:  x
#> W = 1, p-value = 0.05
shapiro.test(y)
#>
#>  Shapiro-Wilk normality test
#>
#> data:  y
#> W = 0.7, p-value = 9e-12
library(tseries)
runs.test(as.factor(s))
s <- sample(c(0, 1), 100, replace = T)
runs.test(as.factor(s))
#>
#>  Runs Test
#>
#> data:  as.factor(s)
#> Standard Normal = 0.1, p-value = 0.9
#> alternative hypothesis: two.sided
s <- c(0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0)
runs.test(as.factor(s))
#>
#>  Runs Test
#>
#> data:  as.factor(s)
#> Standard Normal = -2, p-value = 0.02
#> alternative hypothesis: two.sided
t.test(x, y)
t.test(x, y, paired = TRUE)
load("./data/sat.rdata")
t.test(x, y)
#>
#>  Welch Two Sample t-test
#>
#> data:  x and y
#> t = -1, df = 200, p-value = 0.3
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#>  -46.4  16.2
#> sample estimates:
#> mean of x mean of y
#>      1054      1069
t.test(x, y, paired = TRUE)
#>
#>  Paired t-test
#>
#> data:  x and y
#> t = -20, df = 100, p-value <2e-16
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#>  -16.8 -13.5
#> sample estimates:
#> mean of the differences
#>                   -15.1
wilcox.test(x, y, paired = TRUE)
wilcox.test(x, y)
load(file = "./data/workers.rdata")
wilcox.test(fav, unfav, paired = TRUE)
#>
#>  Wilcoxon signed rank test
#>
#> data:  fav and unfav
#> V = 10, p-value = 1e-04
#> alternative hypothesis: true location shift is not equal to 0
cor.test(x, y)
cor.test(x, y, method = "spearman")
cor(x, y)
#> [1] 0.751
cor.test(x, y)
#>
#>  Pearson's product-moment correlation
#>
#> data:  x and y
#> t = 2, df = 4, p-value = 0.09
#> alternative hypothesis: true correlation is not equal to 0
#> 95 percent confidence interval:
#>  -0.155  0.971
#> sample estimates:
#>   cor
#> 0.751
#>
#>  2-sample test for equality of proportions with continuity
#>  correction
#>
#> data:  ns out of nt
#> X-squared = 5, df = 1, p-value = 0.03
#> alternative hypothesis: two.sided
#> 95 percent confidence interval:
#>  -0.3058 -0.0142
#> sample estimates:
#> prop 1 prop 2
#>   0.48   0.64
ns <- c(48, 64)
nt <- c(100, 100)
prop.test(ns, nt)
successes <- c(14, 10)
trials <- c(38, 40)
prop.test(successes, trials)
#>
#>  2-sample test for equality of proportions with continuity
#>  correction
#>
#> data:  successes out of trials
#> X-squared = 0.8, df = 1, p-value = 0.4
#> alternative hypothesis: two.sided
#> 95 percent confidence interval:
#>  -0.111  0.348
#> sample estimates:
#> prop 1 prop 2
#>  0.368  0.250
pairwise.t.test(x, f)   # x is the data, f is the grouping factor
pairwise.t.test(comb$values, comb$ind)
#>
#>  Pairwise comparisons using t tests with pooled SD
#>
#> data:  comb$values and comb$ind
#>
#>      fresh soph
#> soph 0.001 -
#> jrs  3e-04 0.592
#>
#> P value adjustment method: holm
ks.test(x, y)
#>
#>  Two-sample Kolmogorov-Smirnov test
#>
#> data:  x and y
#> D = 0.2, p-value = 0.04
#> alternative hypothesis: two-sided
ks.test(x, y)
#>
#>  Two-sample Kolmogorov-Smirnov test
#>
#> data:  x and y
#> D = 0.2, p-value = 0.04
#> alternative hypothesis: two-sided
z <- rnorm(100, mean = 4, sd = 6)
ks.test(x, z)
#>
#>  Two-sample Kolmogorov-Smirnov test
#>
#> data:  x and z
#> D = 0.1, p-value = 0.6
#> alternative hypothesis: two-sided
library(tidyverse)
df <- data.frame(x = 1:5, y = 1:5)
ggplot(df, aes(x, y)) +
  geom_point()
ggplot(df, aes(x, y)) +
  geom_point() +
  labs(
    title = "Simple Plot Example",
    subtitle = "with a subtitle",
    x = "x values",
    y = "y values"
  ) +
  theme(panel.background = element_rect(fill = "white", colour = "grey50"))
ggplot(df, aes(x, y)) +
  geom_point()
ggplot(mtcars, aes(hp, mpg)) +
  geom_point()
ggplot(df, aes(x, y)) +
  geom_point() +
  labs(title = "The Title",
       x = "X-axis Label",
       y = "Y-axis Label")
ggplot(mtcars, aes(hp, mpg)) +
  geom_point() +
  labs(title = "Cars: Horsepower vs. Fuel Economy",
       x = "HP",
       y = "Economy (miles per gallon)")
ggplot(df) +
  geom_point(aes(x, y)) +
  theme(panel.background = element_rect(fill = "white", colour = "grey50"))
g1 <- ggplot(mtcars, aes(hp, mpg)) +
  geom_point() +
  labs(title = "Cars: Horsepower vs. Fuel Economy",
       x = "HP",
       y = "Economy (miles per gallon)") +
  theme(panel.background = element_blank())
g1
g2 <- g1 + theme(panel.grid.major =
                   element_line(color = "red", linetype = 3)) +
  # linetype = 3 is dash
  theme(panel.grid.minor =
          element_line(color = "blue", linetype = 4))
  # linetype = 4 is dot dash
g2
g1 +
  theme(panel.grid.major = element_line(colour = "grey"))
ggplot(df, aes(x, y, shape = f)) +
  geom_point()
ggplot(data = iris,
       aes(x = Petal.Length,
           y = Petal.Width)) +
  geom_point()
ggplot(data = iris,
       aes(
         x = Petal.Length,
         y = Petal.Width,
         shape = Species,
         color = Species
       )) +
  geom_point()
g <- ggplot(data = iris,
       aes(x = Petal.Length,
           y = Petal.Width,
           shape="Point Name")) +
  geom_point()  +
  guides(shape=guide_legend(title="Legend Title"))
g
g <- ggplot(data = iris,
            aes(
              x = Petal.Length,
              y = Petal.Width,
              shape = Species,
              color = Species
            )) +
  geom_point() +
  theme(legend.position = "none")
g
g + theme(legend.position = "bottom")
g + theme(legend.position = c(.8, .2))
ggplot(df, aes(x, y)) +
  geom_point() +
  geom_smooth(method = "lm",
              formula = y ~ x,
              se = FALSE)
library(faraway)
data(strongx)

ggplot(strongx, aes(energy, crossx)) +
  geom_point()
g <- ggplot(strongx, aes(energy, crossx)) +
  geom_point()

g + geom_smooth(method = "lm",
                formula = y ~ x,
                se = FALSE)
g + geom_smooth(method = "lm",
                formula = y ~ x)
m <- lm(crossx ~ energy, data = strongx)

ggplot(strongx, aes(energy, crossx)) +
  geom_point() +
  geom_abline(
    intercept = m$coefficients[1],
    slope = m$coefficients[2]
  )
library(GGally)
ggpairs(df)
head(iris)
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1          5.1         3.5          1.4         0.2  setosa
#> 2          4.9         3.0          1.4         0.2  setosa
#> 3          4.7         3.2          1.3         0.2  setosa
#> 4          4.6         3.1          1.5         0.2  setosa
#> 5          5.0         3.6          1.4         0.2  setosa
#> 6          5.4         3.9          1.7         0.4  setosa
library(GGally)
ggpairs(iris)
plot(iris)
ggplot(df, aes(x, y)) +
  geom_point() +
  facet_wrap( ~ f)
data(Cars93, package = "MASS")
ggplot(data = Cars93, aes(MPG.city, Horsepower)) +
  geom_point() +
  facet_wrap( ~ Origin)
ggplot(data = df, aes(x, y)) +
  geom_bar(stat = "identity")
ford_cars <- Cars93 %>%
  filter(Manufacturer == "Ford")

ggplot(ford_cars, aes(Model, Horsepower)) +
  geom_bar(stat = "identity")
ggplot(airquality, aes(month.abb[Month], Temp)) +
  geom_bar(stat = "summary", fun.y = "mean") +
  labs(title = "Mean Temp by Month",
       x = "",
       y = "Temp (deg. F)")
aq_data <- airquality %>%
  arrange(Month) %>%
  mutate(month_abb = fct_inorder(month.abb[Month]))

ggplot(aq_data, aes(month_abb, Temp)) +
  geom_bar(stat = "summary", fun.y = "mean") +
  labs(title = "Mean Temp by Month",
       x = "",
       y = "Temp (deg. F)")
ggplot(df, aes(group, stat)) +
  geom_bar(stat = "identity") +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = .2)
aq_data <- airquality %>%
  arrange(Month) %>%
  mutate(month_abb = fct_inorder(month.abb[Month]))
ggplot(aq_data, aes(month_abb, Temp)) +
  geom_bar(stat = "summary",
           fun.y = "mean",
           fill = "cornflowerblue") +
  stat_summary(fun.data = mean_se, geom = "errorbar") +
  labs(title = "Mean Temp by Month",
       x = "",
       y = "Temp (deg. F)")
ggplot(aq_data, aes(reorder(month_abb,-Temp, mean), Temp)) +
  geom_bar(stat = "summary",
           fun.y = "mean",
           fill = "tomato"           ) +
  stat_summary(fun.data = mean_se, geom = "errorbar") +
  labs(title = "Mean Temp by Month",
       x = "",
       y = "Temp (deg. F)")
ggplot(df, aes(x, y, fill = group))
aq_data <- airquality %>%
  arrange(Month) %>%
  mutate(month_abb = fct_inorder(month.abb[Month]))

ggplot(data = aq_data, aes(month_abb, Temp, fill = month_abb)) +
  geom_bar(stat = "summary", fun.y = "mean") +
  labs(title = "Mean Temp by Month",
       x = "",
       y = "Temp (deg. F)") +
  scale_fill_brewer(palette = "Paired")
ggplot(airquality, aes(month.abb[Month], Temp, fill = ..y..)) +
  geom_bar(stat = "summary", fun.y = "mean") +
  labs(title = "Mean Temp by Month",
       x = "",
       y = "Temp (deg. F)",
       fill = "Temp")
ggplot(df, aes(x, y)) +
  geom_point()
ggplot(df, aes(x , y)) +
  geom_point() +
  geom_line()
ggplot(economics, aes(date , unemploy)) +
  geom_point() +
  geom_line()
ggplot(df, aes(x, y)) +
  geom_line(linetype = 2,
            size = 2,
            col = "red")
x <- 1:10
y1 <- x**1.5
y2 <- x**2
y3 <- x**2.5
df <- data.frame(x, y1, y2, y3)
head(df, 3)
#>   x   y1 y2    y3
#> 1 1 1.00  1  1.00
#> 2 2 2.83  4  5.66
#> 3 3 5.20  9 15.59
df_long <- gather(df, bucket, y, -x)
head(df_long, 3)
#>   x bucket    y
#> 1 1     y1 1.00
#> 2 2     y1 2.83
#> 3 3     y1 5.20
tail(df_long, 3)
#>     x bucket   y
#> 28  8     y3 181
#> 29  9     y3 243
#> 30 10     y3 316
ggplot(df_long, aes(x, y, col = bucket)) +
  geom_line()
ggplot(df, aes(x, y1, size = y2)) +
  geom_line() +
  scale_size(name = "Thickness based on y2")
# example data
n <- 20

x1 <- 1:n
y1 <- rnorm(n, 0, .5)
df1 <- data.frame(x1, y1)

x2 <- (.5 * n):((1.5 * n) - 1)
y2 <- rnorm(n, 1, .5)
df2 <- data.frame(x2, y2)
ggplot() +
  geom_line(data = df1, aes(x = x1, y = y1), color = "darkblue") +
  geom_line(data = df2, aes(x = x2, y = y2), linetype = "dashed")
ggplot() +
  geom_line(data = df1, aes(x = x1, y = y1), color = "darkblue") +
  geom_line(data = df2, aes(x = x2, y = y2), linetype = "dashed") +
  xlim(0, 35) +
  ylim(-2, 2)
# using the data.frame df1 from the prior recipe
ggplot(df1) +
  aes(x = x1, y = y1) +
  geom_point() +
  geom_vline(
    xintercept = 10,
    color = "red",
    linetype = "dashed",
    size = 1.5
  ) +
  geom_hline(yintercept = 0, color = "blue")
samp <- rnorm(1000)
samp_df <- data.frame(samp, x = 1:length(samp))

mean_line <- mean(samp_df$samp)
sd_lines <- mean_line + c(-2, -1, +1, +2) * sd(samp_df$samp)

ggplot(samp_df) +
  aes(x = x, y = samp) +
  geom_point() +
  geom_hline(yintercept = mean_line, color = "darkblue") +
  geom_hline(yintercept = sd_lines, linetype = "dotted")
ggplot(samp_df) +
  aes(y = samp) +
  geom_boxplot()
ggplot(samp_df) +
  aes(y = samp) +
  geom_boxplot() +
  coord_flip()
ggplot(df) +
  aes(x = factor, y = values) +
  geom_boxplot()
data(UScereal, package = "MASS")

ggplot(UScereal) +
  aes(x = as.factor(shelf), y = sugars) +
  geom_boxplot() +
  labs(
    title = "Sugar Content by Shelf",
    x = "Shelf",
    y = "Sugar (grams per portion)"
  )
data(Cars93, package = "MASS")

ggplot(Cars93) +
  geom_histogram(aes(x = MPG.city))
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(Cars93) +
  geom_histogram(aes(x = MPG.city), bins = 13)
ggplot(Cars93) +
  aes(x = MPG.city) +
  geom_histogram(aes(y = ..density..), bins = 21) +
  geom_density()
samp <- rgamma(500, 2, 2)

ggplot() +
  aes(x = samp) +
  geom_histogram(aes(y = ..density..), bins = 10) +
  geom_density()
df <- data.frame(x = rnorm(100))

ggplot(df, aes(sample = x)) +
  stat_qq() +
  stat_qq_line()
ggplot(Cars93, aes(sample = Price)) +
  stat_qq() +
  stat_qq_line()
ggplot(Cars93, aes(sample = log(Price))) +
  stat_qq() +
  stat_qq_line()
df_t <- data.frame(y = rt(100, 5))
est_df <- as.list(MASS::fitdistr(df_t$y, "t")$estimate)[["df"]]
#> Warning in log(s): NaNs produced

#> Warning in log(s): NaNs produced

#> Warning in log(s): NaNs produced
est_df
#> [1] 19.5
ggplot(df_t) +
  aes(sample = y) +
  geom_qq(distribution = qt, dparams = est_df) +
  stat_qq_line(distribution = qt, dparams = est_df)
rate <- 1 / 10
n <- 1000
df_exp <- data.frame(y = rexp(n, rate = rate))
est_exp <- as.list(MASS::fitdistr(df_exp$y, "exponential")$estimate)[["rate"]]
est_exp
#> [1] 0.101
ggplot(df_exp) +
  aes(sample = y) +
  geom_qq(distribution = qexp, dparams = est_exp) +
  stat_qq_line(distribution = qexp, dparams = est_exp)
df <- data.frame(x = rnorm(200), y = rnorm(200))

ggplot(df) +
  aes(x = x, y = y) +
  geom_point(color = "blue")
df <- data.frame(
  x = 1:100,
  y = rnorm(100)
)

ggplot(df) +
  aes(x, y) +
  geom_point()
shade <- if_else(df$y >= 0, "black", "gray")

ggplot(df) +
  aes(x, y) +
  geom_point(color = shade)
ggplot(data.frame(x = c(-3, 3))) +
  aes(x) +
  stat_function(fun = sin)
ggplot(data.frame(x = c(-3.5, 3.5))) +
  aes(x) +
  stat_function(fun = dnorm) +
  ggtitle("Std. Normal Density")
f <- function(x) exp(-abs(x)) * sin(2 * pi * x)
ggplot(data.frame(x = c(-3.5, 3.5))) +
  aes(x) +
  stat_function(fun = f) +
  ggtitle("Dampened Sine Wave")
par(ask = TRUE)
par(ask = FALSE)
Hit <Return> to see next plot:
par(ask = TRUE)

for (i in (11:15)) {
  g <- ggplot(data.frame(x = rnorm(i), y = 1:i)) +
    aes(x, y) +
    geom_point()
  print(g)
}

# don't forget to turn ask off after you're done
par(ask = FALSE)
# example data
z <- rnorm(1000)
y <- runif(1000)

# plot elements
p1 <- ggplot() +
  geom_point(aes(x = 1:1000, y = z))
p2 <- ggplot() +
  geom_point(aes(x = 1:1000, y = y))
p3 <- ggplot() +
  geom_density(aes(z))
p4 <- ggplot() +
  geom_density(aes(y))
devtools::install_github("thomasp85/patchwork")
library(patchwork)
p1 + p2 + p3 + p4
p3 / (p1 + p2 + p4)
library(patchwork)

df <- data.frame(x = c(0, 1))

g1 <- ggplot(df) +
  aes(x) +
  stat_function(
    fun = function(x)
      dbeta(x, 2, 4)
  ) +
  ggtitle("First")

g2 <- ggplot(df) +
  aes(x) +
  stat_function(
    fun = function(x)
      dbeta(x, 4, 1)
  ) +
  ggtitle("Second")

g3 <- ggplot(df) +
  aes(x) +
  stat_function(
    fun = function(x)
      dbeta(x, 1, 1)
  ) +
  ggtitle("Third")

g4 <- ggplot(df) +
  aes(x) +
  stat_function(
    fun = function(x)
      dbeta(x, .5, .5)
  ) +
  ggtitle("Fourth")

g1 + g2 + g3 + g4 + plot_layout(ncol = 2, byrow = TRUE)
g1 + g2 + g3 + g4 + plot_layout(ncol = 2, byrow = FALSE)
ggsave("filename.jpg")
ggsave("g1.png", plot = g1, units = "in", width = 5, height = 4)
set.seed(42)
x <- rnorm(100)
e <- rnorm(100, mean=0, sd=5)
y <- 5 + 15 * x + e
set.seed(42)
x <- rnorm(100)
e <- rnorm(100, mean = 0, sd = 5)
y <- 5 + 15 * x + e

lm(y ~ x)
#>
#> Call:
#> lm(formula = y ~ x)
#>
#> Coefficients:
#> (Intercept)            x
#>        4.56        15.14
Coefficients:
(Intercept)            x
      4.558       15.136
df <- data.frame(x, y)
head(df)
#>        x     y
#> 1  1.371 31.57
#> 2 -0.565  1.75
#> 3  0.363  5.43
#> 4  0.633 23.74
#> 5  0.404  7.73
#> 6 -0.106  3.94
lm(y ~ x, data = df)          # Take x and y from df
#>
#> Call:
#> lm(formula = y ~ x, data = df)
#>
#> Coefficients:
#> (Intercept)            x
#>        4.56        15.14
lm(y ~ u + v + w)
set.seed(42)
u <- rnorm(100)
v <- rnorm(100, mean = 3,  sd = 2)
w <- rnorm(100, mean = -3, sd = 1)
e <- rnorm(100, mean = 0,  sd = 3)

y <- 5 + 4 * u + 3 * v + 2 * w + e

lm(y ~ u + v + w)
#>
#> Call:
#> lm(formula = y ~ u + v + w)
#>
#> Coefficients:
#> (Intercept)            u            v            w
#>        4.77         4.17         3.01         1.91
df <- data.frame(y, u, v, w)
head(df)
#>       y      u     v     w
#> 1 16.67  1.371 5.402 -5.00
#> 2 14.96 -0.565 5.090 -2.67
#> 3  5.89  0.363 0.994 -1.83
#> 4 27.95  0.633 6.697 -0.94
#> 5  2.42  0.404 1.666 -4.38
#> 6  5.73 -0.106 3.211 -4.15
lm(y ~ u + v + w, data = df)
#>
#> Call:
#> lm(formula = y ~ u + v + w, data = df)
#>
#> Coefficients:
#> (Intercept)            u            v            w
#>        4.77         4.17         3.01         1.91
m <- lm(y ~ u + v + w)
lm(y ~ u + v + w)
#>
#> Call:
#> lm(formula = y ~ u + v + w)
#>
#> Coefficients:
#> (Intercept)            u            v            w
#>        4.77         4.17         3.01         1.91
m <- lm(y ~ u + v + w)
summary(m)
#>
#> Call:
#> lm(formula = y ~ u + v + w)
#>
#> Residuals:
#>    Min     1Q Median     3Q    Max
#> -5.383 -1.760 -0.312  1.856  6.984
#>
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)
#> (Intercept)    4.770      0.969    4.92  3.5e-06 ***
#> u              4.173      0.260   16.07  < 2e-16 ***
#> v              3.013      0.148   20.31  < 2e-16 ***
#> w              1.905      0.266    7.15  1.7e-10 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 2.66 on 96 degrees of freedom
#> Multiple R-squared:  0.885,  Adjusted R-squared:  0.882
#> F-statistic:  247 on 3 and 96 DF,  p-value: <2e-16
    coef(m)
#> (Intercept)           u           v           w
#>        4.77        4.17        3.01        1.91
    confint(m)
#>             2.5 % 97.5 %
#> (Intercept)  2.85   6.69
#> u            3.66   4.69
#> v            2.72   3.31
#> w            1.38   2.43
    resid(m)
#>       1       2       3       4       5       6       7       8       9
#> -0.5675  2.2880  0.0972  2.1474 -0.7169 -0.3617  1.0350  2.8040 -4.2496
#>      10      11      12      13      14      15      16      17      18
#> -0.2048 -0.6467 -2.5772 -2.9339 -1.9330  1.7800 -1.4400 -2.3989  0.9245
#>      19      20      21      22      23      24      25      26      27
#> -3.3663  2.6890 -1.4190  0.7871  0.0355 -0.3806  5.0459 -2.5011  3.4516
#>      28      29      30      31      32      33      34      35      36
#>  0.3371 -2.7099 -0.0761  2.0261 -1.3902 -2.7041  0.3953  2.7201 -0.0254
#>      37      38      39      40      41      42      43      44      45
#> -3.9887 -3.9011 -1.9458 -1.7701 -0.2614  2.0977 -1.3986 -3.1910  1.8439
#>      46      47      48      49      50      51      52      53      54
#>  0.8218  3.6273 -5.3832  0.2905  3.7878  1.9194 -2.4106  1.6855 -2.7964
#>      55      56      57      58      59      60      61      62      63
#> -1.3348  3.3549 -1.1525  2.4012 -0.5320 -4.9434 -2.4899 -3.2718 -1.6161
#>      64      65      66      67      68      69      70      71      72
#> -1.5119 -0.4493 -0.9869  5.6273 -4.4626 -1.7568  0.8099  5.0320  0.1689
#>      73      74      75      76      77      78      79      80      81
#>  3.5761 -4.8668  4.2781 -2.1386 -0.9739 -3.6380  0.5788  5.5664  6.9840
#>      82      83      84      85      86      87      88      89      90
#> -3.5119  1.2842  4.1445 -0.4630 -0.7867 -0.7565  1.6384  3.7578  1.8942
#>      91      92      93      94      95      96      97      98      99
#>  0.5542 -0.8662  1.2041 -1.7401 -0.7261  3.2701  1.4012  0.9476 -0.9140
#>     100
#>  2.4278
    deviance(m)
#> [1] 679
    anova(m)
#> Analysis of Variance Table
#>
#> Response: y
#>           Df Sum Sq Mean Sq F value  Pr(>F)
#> u          1   1776    1776   251.0 < 2e-16 ***
#> v          1   3097    3097   437.7 < 2e-16 ***
#> w          1    362     362    51.1 1.7e-10 ***
#> Residuals 96    679       7
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(lm(y ~ u + v + w))
lm(y ~ u + v + w) %>%
  summary
summary(m)
#>
#> Call:
#> lm(formula = y ~ u + v + w)
#>
#> Residuals:
#>    Min     1Q Median     3Q    Max
#> -5.383 -1.760 -0.312  1.856  6.984
#>
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)
#> (Intercept)    4.770      0.969    4.92  3.5e-06 ***
#> u              4.173      0.260   16.07  < 2e-16 ***
#> v              3.013      0.148   20.31  < 2e-16 ***
#> w              1.905      0.266    7.15  1.7e-10 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 2.66 on 96 degrees of freedom
#> Multiple R-squared:  0.885,  Adjusted R-squared:  0.882
#> F-statistic:  247 on 3 and 96 DF,  p-value: <2e-16
    summary(m)$call
    # Residuals:
    #     Min      1Q  Median      3Q     Max
    # -5.3832 -1.7601 -0.3115  1.8565  6.9840
summary(m)$coefficients
#>             Estimate Std. Error t value Pr(>|t|)
#> (Intercept)     4.77      0.969    4.92 3.55e-06
#> u               4.17      0.260   16.07 5.76e-29
#> v               3.01      0.148   20.31 1.58e-36
#> w               1.91      0.266    7.15 1.71e-10
+
+
Residual standard error::
+
[source, r]
# Residual standard error: 2.66 on 96 degrees of freedom
+
-------------------------------------------------------------------
This reports the standard error of the residuals (*σ*)—that is, the
sample standard deviation of *ε*.
-------------------------------------------------------------------

_R_^2^ (coefficient of determination)::
+
[source, r]
# Multiple R-squared:  0.8851,  Adjusted R-squared:  0.8815
+
-------------------------------------------------------------------
*R*^2^ is a measure of the model’s quality. Bigger is better.
Mathematically, it is the fraction of the variance of *y* that is
explained by the regression model. The remaining variance is not
explained by the model, so it must be due to other factors (i.e.,
unknown variables or sampling variability). In this case, the model
explains 0.4981 (49.81%) of the variance of *y*, and the remaining
0.5019 (50.19%) is unexplained.

That being said, we strongly suggest using the adjusted rather than
the basic *R*^2^. The adjusted value accounts for the number of
variables in your model and so is a more realistic assessment of
its effectiveness. In this case, then, we would use 0.8815,
not 0.8851s
-------------------------------------------------------------------

_F_ statistic::
+
[source, r]
# F-statistic: 246.6 on 3 and 96 DF,  p-value: < 2.2e-16
+
--------------------------------------------------------------------
The *F* statistic tells you whether the model is significant
or insignificant. The model is significant if any of the
coefficients are nonzero (i.e., if *β*~*i*~ ≠ 0 for some *i*). It is
insignificant if all coefficients are zero (*β*~1~ = *β*~2~ = … =
*β*~*n*~ = 0).

Conventionally, a *p*-value of less than 0.05 indicates that the
model is likely significant (one or more *β*~*i*~ are nonzero)
whereas values exceeding 0.05 indicate that the model is likely
not significant. Here, the probability is only 0.000391 that our
model is insignificant. That’s good.

Most people look at the *R*^2^ statistic first. The statistician
wisely starts with the *F* statistic, for if the model is not
significant then nothing else matters.
--------------------------------------------------------------------

[[see_also-id240]]
==== See Also

See <<recipe-id231>> for more on extracting statistics and information from the
model object.

[[recipe-id205]]
=== Performing Linear Regression Without an Intercept

[[problem-id205]]
==== Problem

You want to perform a linear regression, but you want to force the
intercept to be zero.

[[solution-id205]]
==== Solution

Add "`+` `0`" to the righthand side of your regression formula. That
will force `lm` to fit the model with a zero intercept:

[source, r]
The corresponding regression equation is:

++++
<ul class="simplelist">
  <li><em>y</em><sub><em>i</em></sub> = <em>βx</em><sub><em>i</em></sub> + <em>ε</em><sub><em>i</em></sub></li>
</ul>
++++

[[discussion-id205]]
==== Discussion

Linear regression ordinarily includes an intercept term, so that is the
default in R. In rare cases, however, you may want to fit the data while
assuming that the intercept is zero. In this you make a modeling
assumption: when _x_ is zero, _y_ should be zero.

When you force a zero intercept, the `lm` output includes a coefficient
for _x_ but no intercept for _y_, as shown here:

[source, r]
We strongly suggest you check that modeling assumption before
proceeding. Perform a regression with an intercept; then see if the
intercept could plausibly be zero. Check the intercept’s confidence
interval. In this example, the confidence interval is (6.26, 8.84):

[source, r]
Because the confidence interval does not contain zero, it is NOT
statistically plausible that the intercept could be zero. So in this
case, it is not reasonable to rerun the regression while forcing a zero
intercept.

[[title-highcor]]
=== Regressing Only Variables that Highly Correlate with your Dependent Variable

[[problem-highcor]]
==== Problem

You have a data frame with many variables and you want to build a
multiple linear regression using only the variables that are highly
correlated to your response (dependent) variable.

[[solution-highcor]]
==== Solution

If `df` is our data frame containing both our response (dependent) and
all our predictor (independent) variables and `dep_var` is our response
variable, we can figure out our best predictors and then use them in a
linear regression. If we want the top 4 predictor variables, we can use
this recipe:

[source, r]
This recipe is a combination of many differnt pieces of logic used
elsewhere in this book. We will describe each step here then walk
through it in the discussion using some example data.

First we drop the response variable out of our pipe chain so that we
have only our predictor variables in our data flow:

[source, r]
Then we use `map_dbl` from `purrr` to perform a pairwise correlation on
each column relative to the response variable.

[source, r]
map_dbl(cor, y = df$dep_var) %>%
We then take the resulting correlations and sort them in decreasing
order:

[source, r]
sort(decreasing = TRUE) %>%
We want only the top 4 correlated variables so we select the top 4
records in the resulting vector:

[source, r]
.[1:4] %>%
And we don't need the correlation values, only the names of the rows
which are the variable names from our original data frame `df`:

[source, r]
Then we can pass those names into our subsetting brackets to select only
the columns with names matching the ones we want:

[source, r]
mod <- lm(df$dep_var ~ as.matrix(best_pred))
# loads the pred data frame
load("./data/pred.rdata")

pred %>%
  select(-resp) %>%
  map_dbl(cor, y = pred$resp)
#> pred1 pred2 pred3 pred4 pred5 pred6
#> 0.573 0.279 0.753 0.799 0.322 0.607
pred %>%
  select(-resp) %>%
  map_dbl(cor, y = pred$resp) %>%
  sort(decreasing = TRUE)
#> pred4 pred3 pred6 pred1 pred5 pred2
#> 0.799 0.753 0.607 0.573 0.322 0.279
pred %>%
  select(-resp) %>%
  map_dbl(cor, y = pred$resp) %>%
  sort(decreasing = TRUE) %>%
  .[1:4]
#> pred4 pred3 pred6 pred1
#> 0.799 0.753 0.607 0.573
pred %>%
  select(-resp) %>%
  map_dbl(cor, y = pred$resp) %>%
  sort(decreasing = TRUE) %>%
  .[1:4] %>%
  names
#> [1] "pred4" "pred3" "pred6" "pred1"
pred %>%
  select(-resp) %>%
  map_dbl(cor, y = pred$resp) %>%
  sort(decreasing = TRUE) %>%
  .[1:4] %>%
  names %>%
  pred[.] %>%
  head
#>    pred4   pred3  pred6  pred1
#> 1  7.252  1.5127  0.560  0.206
#> 2  2.076  0.2579 -0.124 -0.361
#> 3 -0.649  0.0884  0.657  0.758
#> 4  1.365 -0.1209  0.122 -0.727
#> 5 -5.444 -1.1943 -0.391 -1.368
#> 6  2.554  0.6120  1.273  0.433
best_pred <- pred %>%
  select(-resp) %>%
  map_dbl(cor, y = pred$resp) %>%
  sort(decreasing = TRUE) %>%
  .[1:4] %>%
  names %>%
  pred[.]

mod <- lm(pred$resp ~ as.matrix(best_pred))
summary(mod)
#>
#> Call:
#> lm(formula = pred$resp ~ as.matrix(best_pred))
#>
#> Residuals:
#>    Min     1Q Median     3Q    Max
#> -1.485 -0.619  0.189  0.562  1.398
#>
#> Coefficients:
#>                           Estimate Std. Error t value Pr(>|t|)
#> (Intercept)                  1.117      0.340    3.28   0.0051 **
#> as.matrix(best_pred)pred4    0.523      0.207    2.53   0.0231 *
#> as.matrix(best_pred)pred3   -0.693      0.870   -0.80   0.4382
#> as.matrix(best_pred)pred6    1.160      0.682    1.70   0.1095
#> as.matrix(best_pred)pred1    0.343      0.359    0.95   0.3549
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 0.927 on 15 degrees of freedom
#> Multiple R-squared:  0.838,  Adjusted R-squared:  0.795
#> F-statistic: 19.4 on 4 and 15 DF,  p-value: 8.59e-06
lm(y ~ u*v)
y ~ u * v
y ~ u * v * w
y ~ u + v + w + u:v:w
y ~ u * v
y ~ u + v + u:v
y ~ (u + v) ^ 2
full.model <- lm(y ~ x1 + x2 + x3 + x4)
reduced.model <- step(full.model, direction = "backward")
min.model <- lm(y ~ 1)
fwd.model <-
  step(min.model,
       direction = "forward",
       scope = (~ x1 + x2 + x3 + x4))
# example data
set.seed(4)
n <- 150
x1 <- rnorm(n)
x2 <- rnorm(n, 1, 2)
x3 <- rnorm(n, 3, 1)
x4 <- rnorm(n,-2, 2)
e <- rnorm(n, 0, 3)
y <- 4 + x1 + 5 * x3 + e

# build the model
full.model <- lm(y ~ x1 + x2 + x3 + x4)
summary(full.model)
#>
#> Call:
#> lm(formula = y ~ x1 + x2 + x3 + x4)
#>
#> Residuals:
#>    Min     1Q Median     3Q    Max
#> -8.032 -1.774  0.158  2.032  6.626
#>
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)
#> (Intercept)  3.40224    0.80767    4.21  4.4e-05 ***
#> x1           0.53937    0.25935    2.08    0.039 *
#> x2           0.16831    0.12291    1.37    0.173
#> x3           5.17410    0.23983   21.57  < 2e-16 ***
#> x4          -0.00982    0.12954   -0.08    0.940
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 2.92 on 145 degrees of freedom
#> Multiple R-squared:  0.77,   Adjusted R-squared:  0.763
#> F-statistic:  121 on 4 and 145 DF,  p-value: <2e-16
reduced.model <- step(full.model, direction="backward")
#> Start:  AIC=327
#> y ~ x1 + x2 + x3 + x4
#>
#>        Df Sum of Sq  RSS AIC
#> - x4    1         0 1240 325
#> - x2    1        16 1256 327
#> <none>              1240 327
#> - x1    1        37 1277 329
#> - x3    1      3979 5219 540
#>
#> Step:  AIC=325
#> y ~ x1 + x2 + x3
#>
#>        Df Sum of Sq  RSS AIC
#> - x2    1        16 1256 325
#> <none>              1240 325
#> - x1    1        37 1277 327
#> - x3    1      3988 5228 539
#>
#> Step:  AIC=325
#> y ~ x1 + x3
#>
#>        Df Sum of Sq  RSS AIC
#> <none>              1256 325
#> - x1    1        44 1300 328
#> - x3    1      3974 5230 537
summary(reduced.model)
#>
#> Call:
#> lm(formula = y ~ x1 + x3)
#>
#> Residuals:
#>    Min     1Q Median     3Q    Max
#> -8.148 -1.850 -0.055  2.026  6.550
#>
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)
#> (Intercept)    3.648      0.751    4.86    3e-06 ***
#> x1             0.582      0.255    2.28    0.024 *
#> x3             5.147      0.239   21.57   <2e-16 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 2.92 on 147 degrees of freedom
#> Multiple R-squared:  0.767,  Adjusted R-squared:  0.763
#> F-statistic:  241 on 2 and 147 DF,  p-value: <2e-16
min.model <- lm(y ~ 1)
fwd.model <- step(
  min.model,
  direction = "forward",
  scope = (~ x1 + x2 + x3 + x4),
  trace = 0
)
summary(fwd.model)
#>
#> Call:
#> lm(formula = y ~ x3 + x1)
#>
#> Residuals:
#>    Min     1Q Median     3Q    Max
#> -8.148 -1.850 -0.055  2.026  6.550
#>
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)
#> (Intercept)    3.648      0.751    4.86    3e-06 ***
#> x3             5.147      0.239   21.57   <2e-16 ***
#> x1             0.582      0.255    2.28    0.024 *
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 2.92 on 147 degrees of freedom
#> Multiple R-squared:  0.767,  Adjusted R-squared:  0.763
#> F-statistic:  241 on 2 and 147 DF,  p-value: <2e-16
full.model <- lm(y ~ (x1 + x2 + x3 + x4) ^ 4)
reduced.model <- step(full.model, direction = "backward")
#> Start:  AIC=337
#> y ~ (x1 + x2 + x3 + x4)^4
#>
#>               Df Sum of Sq  RSS AIC
#> - x1:x2:x3:x4  1    0.0321 1145 335
#> <none>                     1145 337
#>
#> Step:  AIC=335
#> y ~ x1 + x2 + x3 + x4 + x1:x2 + x1:x3 + x1:x4 + x2:x3 + x2:x4 +
#>     x3:x4 + x1:x2:x3 + x1:x2:x4 + x1:x3:x4 + x2:x3:x4
#>
#>            Df Sum of Sq  RSS AIC
#> - x2:x3:x4  1      0.76 1146 333
#> - x1:x3:x4  1      8.37 1154 334
#> <none>                  1145 335
#> - x1:x2:x4  1     20.95 1166 336
#> - x1:x2:x3  1     25.18 1170 336
#>
#> Step:  AIC=333
#> y ~ x1 + x2 + x3 + x4 + x1:x2 + x1:x3 + x1:x4 + x2:x3 + x2:x4 +
#>     x3:x4 + x1:x2:x3 + x1:x2:x4 + x1:x3:x4
#>
#>            Df Sum of Sq  RSS AIC
#> - x1:x3:x4  1      8.74 1155 332
#> <none>                  1146 333
#> - x1:x2:x4  1     21.72 1168 334
#> - x1:x2:x3  1     26.51 1172 334
#>
#> Step:  AIC=332
#> y ~ x1 + x2 + x3 + x4 + x1:x2 + x1:x3 + x1:x4 + x2:x3 + x2:x4 +
#>     x3:x4 + x1:x2:x3 + x1:x2:x4
#>
#>            Df Sum of Sq  RSS AIC
#> - x3:x4     1      0.29 1155 330
#> <none>                  1155 332
#> - x1:x2:x4  1     23.24 1178 333
#> - x1:x2:x3  1     31.11 1186 334
#>
#> Step:  AIC=330
#> y ~ x1 + x2 + x3 + x4 + x1:x2 + x1:x3 + x1:x4 + x2:x3 + x2:x4 +
#>     x1:x2:x3 + x1:x2:x4
#>
#>            Df Sum of Sq  RSS AIC
#> <none>                  1155 330
#> - x1:x2:x4  1      23.4 1178 331
#> - x1:x2:x3  1      31.5 1187 332
lm(y ~ x1, subset=1:100)          # Use only x[1:100]
## example data
n <- 1000
x <- rnorm(n)
e <- rnorm(n, 0, .5)
y <- 3 + 2 * x + e
lm(y ~ x, subset = 1:500)
#>
#> Call:
#> lm(formula = y ~ x, subset = 1:500)
#>
#> Coefficients:
#> (Intercept)            x
#>           3            2
lm(y ~ x, subset = 1:floor(length(x) / 2))
#>
#> Call:
#> lm(formula = y ~ x, subset = 1:floor(length(x)/2))
#>
#> Coefficients:
#> (Intercept)            x
#>           3            2
load('./data/lab_df.rdata')
lm(y ~ x, subset = (lab == "NJ"), data = lab_df)
#>
#> Call:
#> lm(formula = y ~ x, data = lab_df, subset = (lab == "NJ"))
#>
#> Coefficients:
#> (Intercept)            x
#>        2.58         5.03
lm(y ~ u + v)    # Not quite right
lm(y ~ u + u ^ 2)  # That's an interaction, not a quadratic term
lm(y ~ I(u + v))
lm(y ~ u + I(u ^ 2))
load('./data/df_squared.rdata')
m <- lm(y ~ u + I(u ^ 2), data = df_squared)
predict(m, newdata = data.frame(u = 13.4))
#>   1
#> 877
lm(y ~ poly(x, 3, raw = TRUE))
x_sq <- x ^ 2
x_cub <- x ^ 3
m <- lm(y ~ x + x_sq + x_cub)
m <- lm(y ~ poly(x, 3, raw = TRUE))
lm(y ~ x + x^2 + x^3)     # Does not do what you think!
lm(y ~ x + I(x ^ 2) + I(x ^ 3))
lm(log(y) ~ x)
# read in our example data
load(file = './data/df_decay.rdata')
z <- df_decay$z
t <- df_decay$time

# transform and model
m <- lm(log(z) ~ t)
summary(m)
#>
#> Call:
#> lm(formula = log(z) ~ t)
#>
#> Residuals:
#>     Min      1Q  Median      3Q     Max
#> -0.4479 -0.0993  0.0049  0.0978  0.2802
#>
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)
#> (Intercept)   0.6887     0.0306    22.5   <2e-16 ***
#> t            -2.0118     0.0351   -57.3   <2e-16 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 0.148 on 98 degrees of freedom
#> Multiple R-squared:  0.971,  Adjusted R-squared:  0.971
#> F-statistic: 3.28e+03 on 1 and 98 DF,  p-value: <2e-16
lm(sqrt(y) ~ month)
lm(y ~ sqrt(x))
lm(log(y) ~ log(x))
library(MASS)
m <- lm(y ~ x)
boxcox(m)
set.seed(9)
x <- 10:100
eps <- rnorm(length(x), sd = 5)
y <- (x + eps) ^ (-1 / 1.5)
m <- lm(y ~ x)
summary(m)
#>
#> Call:
#> lm(formula = y ~ x)
#>
#> Residuals:
#>      Min       1Q   Median       3Q      Max
#> -0.04032 -0.01633 -0.00792  0.00996  0.14516
#>
#> Coefficients:
#>              Estimate Std. Error t value Pr(>|t|)
#> (Intercept)  0.166885   0.007078    23.6   <2e-16 ***
#> x           -0.001465   0.000116   -12.6   <2e-16 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 0.0291 on 89 degrees of freedom
#> Multiple R-squared:  0.641,  Adjusted R-squared:  0.637
#> F-statistic:  159 on 1 and 89 DF,  p-value: <2e-16
plot(m, which = 1)       # Plot only the fitted vs residuals
library(MASS)
#>
#> Attaching package: 'MASS'
#> The following object is masked from 'package:dplyr':
#>
#>     select
bc <- boxcox(m)
which.max(bc$y)
#> [1] 13
lambda <- bc$x[which.max(bc$y)]
lambda
#> [1] -1.52
z <- y ^ lambda
m2 <- lm(z ~ x)
summary(m2)
#>
#> Call:
#> lm(formula = z ~ x)
#>
#> Residuals:
#>     Min      1Q  Median      3Q     Max
#> -13.459  -3.711  -0.228   2.206  14.188
#>
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)
#> (Intercept)  -0.6426     1.2517   -0.51     0.61
#> x             1.0514     0.0205   51.20   <2e-16 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 5.15 on 89 degrees of freedom
#> Multiple R-squared:  0.967,  Adjusted R-squared:  0.967
#> F-statistic: 2.62e+03 on 1 and 89 DF,  p-value: <2e-16
m2 <- lm(I(y ^ lambda) ~ x)
load(file = './data/conf.rdata')
m <- lm(y ~ x1 + x2)
confint(m)
#>             2.5 % 97.5 %
#> (Intercept) -3.90   6.47
#> x1          -2.58   6.24
#> x2           4.67   5.17
confint(m)
#>             2.5 % 97.5 %
#> (Intercept) -3.90   6.47
#> x1          -2.58   6.24
#> x2           4.67   5.17
confint(m, level = 0.99)
#>             0.5 % 99.5 %
#> (Intercept) -5.72   8.28
#> x1          -4.12   7.79
#> x2           4.58   5.26
m <- lm(y ~ x1 + x2)
plot(m, which = 1)
m <- lm(y ~ x1 + x2)
plot(m)
library(car)
#> Loading required package: carData
#>
#> Attaching package: 'car'
#> The following object is masked from 'package:dplyr':
#>
#>     recode
#> The following object is masked from 'package:purrr':
#>
#>     some
outlierTest(m)
#> No Studentized residuals with Bonferonni p < 0.05
#> Largest |rstudent|:
#>   rstudent unadjusted p-value Bonferonni p
#> 2     2.27             0.0319        0.956
length(x1)
#> [1] 30
length(x2)
#> [1] 30
length(y)
#> [1] 30

m <- lm(y ~ x1 + x2)
par(mfrow = (c(2, 2))) # this gives us a 2x2 plot
plot(m)
load(file = './data/bad.rdata')
m <- lm(y2 ~ x3 + x4)
par(mfrow = (c(2, 2)))      # this gives us a 2x2 plot
plot(m)
outlierTest(m)
#>    rstudent unadjusted p-value Bonferonni p
#> 28     4.46           7.76e-05       0.0031
influence.measures(m)
influence.measures(m)
#> Influence measures of
#>   lm(formula = y2 ~ x3 + x4) :
#>
#>      dfb.1_   dfb.x3   dfb.x4    dffit cov.r   cook.d    hat inf
#> 1  -0.18784  0.15174  0.07081 -0.22344 1.059 1.67e-02 0.0506
#> 2   0.27637 -0.04367 -0.39042  0.45416 1.027 6.71e-02 0.0964
#> 3  -0.01775 -0.02786  0.01088 -0.03876 1.175 5.15e-04 0.0772
#> 4   0.15922 -0.14322  0.25615  0.35766 1.133 4.27e-02 0.1156
#> 5  -0.10537  0.00814 -0.06368 -0.13175 1.078 5.87e-03 0.0335
#> 6   0.16942  0.07465  0.42467  0.48572 1.034 7.66e-02 0.1062
etc ...
library(lmtest)
m <- lm(y ~ x)           # Create a model object
dwtest(m)                # Test the model residuals
acf(m)                   # Plot the ACF of the model residuals
library(lmtest)
#> Loading required package: zoo
#>
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#>
#>     as.Date, as.Date.numeric
load(file = './data/ac.rdata')
m <- lm(y1 ~ x)
dwtest(m)
#>
#>  Durbin-Watson test
#>
#> data:  m
#> DW = 2, p-value = 0.4
#> alternative hypothesis: true autocorrelation is greater than 0
m <- lm(y2 ~ x)
dwtest(m)
#>
#>  Durbin-Watson test
#>
#> data:  m
#> DW = 2, p-value = 0.01
#> alternative hypothesis: true autocorrelation is greater than 0
dwtest(m, alternative = "two.sided")
load(file = './data/pred2.rdata')

m <- lm(y ~ u + v + w)
preds <- data.frame(u = 3.1, v = 4.0, w = 5.5)
predict(m, newdata = preds)
#>  1
#> 45
preds <- data.frame(
  u = c(3.0, 3.1, 3.2, 3.3),
  v = c(3.9, 4.0, 4.1, 4.2),
  w = c(5.3, 5.5, 5.7, 5.9)
)
predict(m, newdata = preds)
#>    1    2    3    4
#> 43.8 45.0 46.3 47.5
predict(m, newdata = preds, interval = "prediction")
predict(m, newdata = preds, interval = "prediction")
#>    fit  lwr  upr
#> 1 43.8 38.2 49.4
#> 2 45.0 39.4 50.7
#> 3 46.3 40.6 51.9
#> 4 47.5 41.8 53.2
oneway.test(x ~ f)
load(file = './data/anova.rdata')
oneway.test(r ~ mon, data = GSPC_df)
#>
#>  One-way analysis of means (not assuming equal variances)
#>
#> data:  r and mon
#> F = 2, num df = 10, denom df = 7000, p-value = 0.03
oneway.test(r ~ mon, data = GSPC_df, subset = tail(seq_along(r), 2500))
#>
#>  One-way analysis of means (not assuming equal variances)
#>
#> data:  r and mon
#> F = 0.7, num df = 10, denom df = 1000, p-value = 0.8
oneway.test(x ~ f, var.equal = TRUE)
m <- aov(x ~ f)
summary(m)
interaction.plot(pred1, pred2, resp)
library(faraway)
data(rats)
interaction.plot(rats$poison, rats$treat, rats$time)
m <- aov(x ~ f)
TukeyHSD(m)
plot(TukeyHSD(m))
load(file = './data/anova.rdata')
oneway.test(r ~ wday, subset = 1:2500, data = GSPC_df)
#>
#>  One-way analysis of means (not assuming equal variances)
#>
#> data:  r and wday
#> F = 10, num df = 4, denom df = 1000, p-value = 5e-10
m <- aov(r ~ wday, subset = 1:2500, data = GSPC_df)
TukeyHSD(m)
#>   Tukey multiple comparisons of means
#>     95% family-wise confidence level
#>
#> Fit: aov(formula = r ~ wday, data = GSPC_df, subset = 1:2500)
#>
#> $wday
#>              diff       lwr       upr p adj
#> Mon-Fri -0.003153 -4.40e-03 -0.001911 0.000
#> Thu-Fri -0.000934 -2.17e-03  0.000304 0.238
#> Tue-Fri -0.001855 -3.09e-03 -0.000618 0.000
#> Wed-Fri -0.000783 -2.01e-03  0.000448 0.412
#> Thu-Mon  0.002219  9.79e-04  0.003460 0.000
#> Tue-Mon  0.001299  5.85e-05  0.002538 0.035
#> Wed-Mon  0.002370  1.14e-03  0.003605 0.000
#> Tue-Thu -0.000921 -2.16e-03  0.000314 0.249
#> Wed-Thu  0.000151 -1.08e-03  0.001380 0.997
#> Wed-Tue  0.001072 -1.57e-04  0.002300 0.121
plot(TukeyHSD(m))
kruskal.test(x ~ f)
load(file = './data/student_data.rdata')
head(student_data)
#> # A tibble: 6 x 4
#>   att.fact hw.mean midterm hw
#>   <fct>      <dbl>   <dbl> <fct>
#> 1 3          0.808   0.818 4
#> 2 3          0.830   0.682 4
#> 3 3          0.444   0.511 2
#> 4 3          0.663   0.670 3
#> 5 2          0.9     0.682 4
#> 6 3          0.948   0.954 4
kruskal.test(midterm ~ hw, data = student_data)
#>
#>  Kruskal-Wallis rank sum test
#>
#> data:  midterm by hw
#> Kruskal-Wallis chi-squared = 30, df = 4, p-value = 4e-05
anova(m1, m2)
load(file = './data/anova2.rdata')
m1 <- lm(y ~ u)
m2 <- lm(y ~ u + v)
m3 <- lm(y ~ u + v + w)
anova(m1, m2)
#> Analysis of Variance Table
#>
#> Model 1: y ~ u
#> Model 2: y ~ u + v
#>   Res.Df RSS Df Sum of Sq    F Pr(>F)
#> 1     18 197
#> 2     17 130  1      66.4 8.67 0.0091 **
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(m2, m3)
#> Analysis of Variance Table
#>
#> Model 1: y ~ u + v
#> Model 2: y ~ u + v + w
#>   Res.Df RSS Df Sum of Sq    F Pr(>F)
#> 1     17 130
#> 2     16 103  1      27.5 4.27  0.055 .
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
$ sudo apt-get install r-base
$ sudo yum install R.i386
$ sudo apt-get install r-base-html r-doc-html
1 + 1
#> [1] 2
max(1, 3, 5)
#> [1] 5
max(
  1, 3,
  +5
)
#> [1] 5
q()
help.start()
help(functionname)
args(functionname)
example(functionname)
help(mean)
?mean
args(mean)
#> function (x, ...)
#> NULL
args(sd)
#> function (x, na.rm = FALSE)
#> NULL
example(mean)
#>
#> mean> x <- c(0:10, 50)
#>
#> mean> xm <- mean(x)
#>
#> mean> c(xm, mean(x, trim = 0.10))
#> [1] 8.75 5.50
help.search("pattern")
> ??pattern
help(adf.test)
#> No documentation for 'adf.test' in specified packages and libraries:
#> you could try '??adf.test'
help.search("adf.test")
Help files with alias or concept or title matching 'adf.test' using
regular expression matching:

tseries::adf.test       Augmented Dickey-Fuller Test

Type '?PKG::FOO' to inspect entry 'PKG::FOO TITLE'.
help(adf.test, package = "tseries")
?tseries::adf.test
help.search("dickey-fuller")
Help files with alias or concept or title matching 'dickey-fuller' using
fuzzy matching:

fUnitRoots::DickeyFullerPValues
                         Dickey-Fuller p Values
tseries::adf.test        Augmented Dickey-Fuller Test
urca::ur.df              Augmented-Dickey-Fuller Unit Root Test

Type '?PKG::FOO' to inspect entry 'PKG::FOO TITLE'.
help(package = "packagename")
help(package = "tseries")
vignette()
vignette(package = "packagename")
vignette("vignettename")
RSiteSearch("key phrase")
RSiteSearch("canonical correlation")
set.seed(42)
n <- 4
example_df <- data.frame(
  some_reals = rnorm(n),
  some_letters = sample(LETTERS, n, replace = TRUE),
  some_ints = sample(1:10, n, replace = TRUE)
)
example_df
#>   some_reals some_letters some_ints
#> 1      1.371            R        10
#> 2     -0.565            S         3
#> 3      0.363            L         5
#> 4      0.633            S        10
data(mtcars)
head(mtcars)
#>                    mpg cyl disp  hp drat   wt qsec vs am gear carb
#> Mazda RX4         21.0   6  160 110 3.90 2.62 16.5  0  1    4    4
#> Mazda RX4 Wag     21.0   6  160 110 3.90 2.88 17.0  0  1    4    4
#> Datsun 710        22.8   4  108  93 3.85 2.32 18.6  1  1    4    1
#> Hornet 4 Drive    21.4   6  258 110 3.08 3.21 19.4  1  0    3    1
#> Hornet Sportabout 18.7   8  360 175 3.15 3.44 17.0  0  0    3    2
#> Valiant           18.1   6  225 105 2.76 3.46 20.2  1  0    3    1
dput(head(mtcars, 2))
#> structure(list(mpg = c(21, 21), cyl = c(6, 6), disp = c(160,
#> 160), hp = c(110, 110), drat = c(3.9, 3.9), wt = c(2.62, 2.875
#> ), qsec = c(16.46, 17.02), vs = c(0, 0), am = c(1, 1), gear = c(4,
#> 4), carb = c(4, 4)), row.names = c("Mazda RX4", "Mazda RX4 Wag"
#> ), class = "data.frame")
example_df <- structure(list(mpg = c(21, 21), cyl = c(6, 6), disp = c(160,
160), hp = c(110, 110), drat = c(3.9, 3.9), wt = c(2.62, 2.875
), qsec = c(16.46, 17.02), vs = c(0, 0), am = c(1, 1), gear = c(4,
4), carb = c(4, 4)), row.names = c("Mazda RX4", "Mazda RX4 Wag"
), class = "data.frame")

example_df
#>               mpg cyl disp  hp drat   wt qsec vs am gear carb
#> Mazda RX4      21   6  160 110  3.9 2.62 16.5  0  1    4    4
#> Mazda RX4 Wag  21   6  160 110  3.9 2.88 17.0  0  1    4    4
pi
#> [1] 3.14
sqrt(2)
#> [1] 1.41
print(pi)
#> [1] 3.14
print(sqrt(2))
#> [1] 1.41
print(matrix(c(1, 2, 3, 4), 2, 2))
#>      [,1] [,2]
#> [1,]    1    3
#> [2,]    2    4
print(list("a", "b", "c"))
#> [[1]]
#> [1] "a"
#>
#> [[2]]
#> [1] "b"
#>
#> [[3]]
#> [1] "c"
print("The zero occurs at", 2 * pi, "radians.")
#> Error in print.default("The zero occurs at", 2 * pi, "radians."): invalid 'quote' argument
print("The zero occurs at")
#> [1] "The zero occurs at"
print(2 * pi)
#> [1] 6.28
print("radians")
#> [1] "radians"
cat("The zero occurs at", 2 * pi, "radians.", "\n")
#> The zero occurs at 6.28 radians.
fib <- c(0, 1, 1, 2, 3, 5, 8, 13, 21, 34)
cat("The first few Fibonacci numbers are:", fib, "...\n")
#> The first few Fibonacci numbers are: 0 1 1 2 3 5 8 13 21 34 ...
cat(list("a", "b", "c"))
#> Error in cat(list("a", "b", "c")): argument 1 (type 'list') cannot be handled by 'cat'
x <- 3
x <- 3
y <- 4
z <- sqrt(x^2 + y^2)
print(z)
#> [1] 5
x <- 3
print(x)
#> [1] 3

x <- c("fee", "fie", "foe", "fum")
print(x)
#> [1] "fee" "fie" "foe" "fum"
x <<- 3
foo <- 3
print(foo)
#> [1] 3
5 -> fum
print(fum)
#> [1] 5
library(tidyverse)

mpg %>%
  head %>%
  print
#> # A tibble: 6 x 11
#>   manufacturer model displ  year   cyl trans drv     cty   hwy fl    class
#>   <chr>        <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
#> 1 audi         a4      1.8  1999     4 auto~ f        18    29 p     comp~
#> 2 audi         a4      1.8  1999     4 manu~ f        21    29 p     comp~
#> 3 audi         a4      2    2008     4 manu~ f        20    31 p     comp~
#> 4 audi         a4      2    2008     4 auto~ f        21    30 p     comp~
#> 5 audi         a4      2.8  1999     6 auto~ f        16    26 p     comp~
#> 6 audi         a4      2.8  1999     6 manu~ f        18    26 p     comp~
print(head(mpg))
#> # A tibble: 6 x 11
#>   manufacturer model displ  year   cyl trans drv     cty   hwy fl    class
#>   <chr>        <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
#> 1 audi         a4      1.8  1999     4 auto~ f        18    29 p     comp~
#> 2 audi         a4      1.8  1999     4 manu~ f        21    29 p     comp~
#> 3 audi         a4      2    2008     4 manu~ f        20    31 p     comp~
#> 4 audi         a4      2    2008     4 auto~ f        21    30 p     comp~
#> 5 audi         a4      2.8  1999     6 auto~ f        16    26 p     comp~
#> 6 audi         a4      2.8  1999     6 manu~ f        18    26 p     comp~
x %>% head

head(x)
mpg %>%
  head %>%
  print

has the same effect as this code which use an intermediate variable.
x <- head(mpg)
print(x)
print(head(mpg))
iris %>% head(10)

head(iris, 10)
10 %>% head(x, .)

head(x, 10)
x <- 10
y <- 50
z <- c("three", "blind", "mice")
f <- function(n, p) sqrt(p * (1 - p) / n)
ls()
#> [1] "f" "x" "y" "z"
ls()
#> character(0)
x <- 10
y <- 50
z <- c("three", "blind", "mice")
f <- function(n, p) sqrt(p * (1 - p) / n)
ls.str()
#> f : function (n, p)
#> x :  num 10
#> y :  num 50
#> z :  chr [1:3] "three" "blind" "mice"
ls()
#> [1] "f" "x" "y" "z"
ls(all.names = TRUE)
#> [1] ".Random.seed" "f"            "x"            "y"
#> [5] "z"
x <- 2 * pi
x
#> [1] 6.28
rm(x)
x
#> Error in eval(expr, envir, enclos): object 'x' not found
rm(x, y, z)
ls()
#> [1] "f" "x" "y" "z"
rm(list = ls())
ls()
#> character(0)
c(1, 1, 2, 3, 5, 8, 13, 21)
#> [1]  1  1  2  3  5  8 13 21
c(1 * pi, 2 * pi, 3 * pi, 4 * pi)
#> [1]  3.14  6.28  9.42 12.57
c("My", "twitter", "handle", "is", "@cmastication")
#> [1] "My"            "twitter"       "handle"        "is"
#> [5] "@cmastication"
c(TRUE, TRUE, FALSE, TRUE)
#> [1]  TRUE  TRUE FALSE  TRUE
v1 <- c(1, 2, 3)
v2 <- c(4, 5, 6)
c(v1, v2)
#> [1] 1 2 3 4 5 6
v1 <- c(1, 2, 3)
v3 <- c("A", "B", "C")
c(v1, v3)
#> [1] "1" "2" "3" "A" "B" "C"
mode(3.1415)
#> [1] "numeric"
mode("foo")
#> [1] "character"
c(3.1415, "foo")
#> [1] "3.1415" "foo"
mode(c(3.1415, "foo"))
#> [1] "character"
x <- c(0, 1, 1, 2, 3, 5, 8, 13, 21, 34)
mean(x)
#> [1] 8.8
median(x)
#> [1] 4
sd(x)
#> [1] 11
var(x)
#> [1] 122
x <- c(0, 1, 1, 2, 3, 5, 8, 13, 21, 34)
y <- log(x + 1)
cor(x, y)
#> [1] 0.907
cov(x, y)
#> [1] 11.5
x <- c(0, 1, 1, 2, 3, NA)
mean(x)
#> [1] NA
sd(x)
#> [1] NA
x <- c(0, 1, 1, 2, 3, NA)
sd(x, na.rm = TRUE)
#> [1] 1.14
data(cars)

map_dbl(cars, mean)
#> speed  dist
#>  15.4  43.0
map_dbl(cars, sd)
#> speed  dist
#>  5.29 25.77
map_dbl(cars, median)
#> speed  dist
#>    15    36
var(cars)
#>       speed dist
#> speed    28  110
#> dist    110  664
cor(cars)
#>       speed  dist
#> speed 1.000 0.807
#> dist  0.807 1.000
cov(cars)
#>       speed dist
#> speed    28  110
#> dist    110  664
1:5
#> [1] 1 2 3 4 5
seq(from = 1, to = 5, by = 2)
#> [1] 1 3 5
rep(1, times = 5)
#> [1] 1 1 1 1 1
0:9
#>  [1] 0 1 2 3 4 5 6 7 8 9
10:19
#>  [1] 10 11 12 13 14 15 16 17 18 19
9:0
#>  [1] 9 8 7 6 5 4 3 2 1 0
10:20 %>% mean()
seq(from = 0, to = 20)
#>  [1]  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
seq(from = 0, to = 20, by = 2)
#>  [1]  0  2  4  6  8 10 12 14 16 18 20
seq(from = 0, to = 20, by = 5)
#> [1]  0  5 10 15 20
seq(from = 0, to = 20, length.out = 5)
#> [1]  0  5 10 15 20
seq(from = 0, to = 100, length.out = 5)
#> [1]   0  25  50  75 100
seq(from = 1.0, to = 2.0, length.out = 5)
#> [1] 1.00 1.25 1.50 1.75 2.00
rep(pi, times = 5)
#> [1] 3.14 3.14 3.14 3.14 3.14
a <- 3
a == pi # Test for equality
#> [1] FALSE
a != pi # Test for inequality
#> [1] TRUE
a < pi
#> [1] TRUE
a > pi
#> [1] FALSE
a <= pi
#> [1] TRUE
a >= pi
#> [1] FALSE
v <- c(3, pi, 4)
w <- c(pi, pi, pi)
v == w # Compare two 3-element vectors
#> [1] FALSE  TRUE FALSE
v != w
#> [1]  TRUE FALSE  TRUE
v < w
#> [1]  TRUE FALSE FALSE
v <= w
#> [1]  TRUE  TRUE FALSE
v > w
#> [1] FALSE FALSE  TRUE
v >= w
#> [1] FALSE  TRUE  TRUE
v <- c(3, pi, 4)
v == pi # Compare a 3-element vector against one number
#> [1] FALSE  TRUE FALSE
v != pi
#> [1]  TRUE FALSE  TRUE
v <- c(3, pi, 4)
any(v == pi) # Return TRUE if any element of v equals pi
#> [1] TRUE
all(v == 0) # Return TRUE if all elements of v are zero
#> [1] FALSE
fib <- c(0, 1, 1, 2, 3, 5, 8, 13, 21, 34)
fib
#>  [1]  0  1  1  2  3  5  8 13 21 34
fib[1]
#> [1] 0
fib[2]
#> [1] 1
fib[3]
#> [1] 1
fib[4]
#> [1] 2
fib[5]
#> [1] 3
fib[1:3] # Select elements 1 through 3
#> [1] 0 1 1
fib[4:9] # Select elements 4 through 9
#> [1]  2  3  5  8 13 21
fib[c(1, 2, 4, 8)]
#> [1]  0  1  2 13
fib[-1] # Ignore first element
#> [1]  1  1  2  3  5  8 13 21 34
fib[1:3] # As before
#> [1] 0 1 1
fib[-(1:3)] # Invert sign of index to exclude instead of select
#> [1]  2  3  5  8 13 21 34
fib < 10 # This vector is TRUE wherever fib is less than 10
#>  [1]  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE FALSE
fib[fib < 10] # Use that vector to select elements less than 10
#> [1] 0 1 1 2 3 5 8
fib %% 2 == 0 # This vector is TRUE wherever fib is even
#>  [1]  TRUE FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE FALSE  TRUE
fib[fib %% 2 == 0] # Use that vector to select the even elements
#> [1]  0  2  8 34
v <- c(3, 6, 1, 9, 11, 16, 0, 3, 1, 45, 2, 8, 9, 6, -4)
v[ v > median(v)]
#> [1]  9 11 16 45  8  9
v[ (v < quantile(v, 0.05)) | (v > quantile(v, 0.95)) ]
#> [1] 45 -4
v[ abs(v - mean(v)) > sd(v)]
#> [1] 45 -4
v <- c(1, 2, 3, NA, 5)
v[!is.na(v) & !is.null(v)]
#> [1] 1 2 3 5
years <- c(1960, 1964, 1976, 1994)
names(years) <- c("Kennedy", "Johnson", "Carter", "Clinton")
years
#> Kennedy Johnson  Carter Clinton
#>    1960    1964    1976    1994
years["Carter"]
#> Carter
#>   1976
years["Clinton"]
#> Clinton
#>    1994
years[c("Carter", "Clinton")]
#>  Carter Clinton
#>    1976    1994
v <- c(11, 12, 13, 14, 15)
w <- c(1, 2, 3, 4, 5)
v + w
#> [1] 12 14 16 18 20
v - w
#> [1] 10 10 10 10 10
v * w
#> [1] 11 24 39 56 75
v / w
#> [1] 11.00  6.00  4.33  3.50  3.00
w^v
#> [1] 1.00e+00 4.10e+03 1.59e+06 2.68e+08 3.05e+10
w
#> [1] 1 2 3 4 5
w + 2
#> [1] 3 4 5 6 7
w - 2
#> [1] -1  0  1  2  3
w * 2
#> [1]  2  4  6  8 10
w / 2
#> [1] 0.5 1.0 1.5 2.0 2.5
2^w
#> [1]  2  4  8 16 32
w
#> [1] 1 2 3 4 5
mean(w)
#> [1] 3
w - mean(w)
#> [1] -2 -1  0  1  2
w
#> [1] 1 2 3 4 5
sd(w)
#> [1] 1.58
(w - mean(w)) / sd(w)
#> [1] -1.265 -0.632  0.000  0.632  1.265
w <- 1:5
w
#> [1] 1 2 3 4 5
sqrt(w)
#> [1] 1.00 1.41 1.73 2.00 2.24
log(w)
#> [1] 0.000 0.693 1.099 1.386 1.609
sin(w)
#> [1]  0.841  0.909  0.141 -0.757 -0.959
n <- 10
0:n - 1
#>  [1] -1  0  1  2  3  4  5  6  7  8  9
library(tidyverse)
data(mpg)

mpg %>%
  filter(cty > 21) %>%
  head(3) %>%
  print()
#> # A tibble: 3 x 11
#>   manufacturer model displ  year   cyl trans drv     cty   hwy fl    class
#>   <chr>        <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
#> 1 chevrolet    mali~   2.4  2008     4 auto~ f        22    30 r     mids~
#> 2 honda        civic   1.6  1999     4 manu~ f        28    33 r     subc~
#> 3 honda        civic   1.6  1999     4 auto~ f        24    32 r     subc~
temp1 <- filter(mpg, cty > 21)
temp2 <- head(temp1, 3)
print(temp2)
#> # A tibble: 3 x 11
#>   manufacturer model displ  year   cyl trans drv     cty   hwy fl    class
#>   <chr>        <chr> <dbl> <int> <int> <chr> <chr> <int> <int> <chr> <chr>
#> 1 chevrolet    mali~   2.4  2008     4 auto~ f        22    30 r     mids~
#> 2 honda        civic   1.6  1999     4 manu~ f        28    33 r     subc~
#> 3 honda        civic   1.6  1999     4 auto~ f        24    32 r     subc~
x %>% head()
head(x)
x %>% head(n = 10)

head(x, n = 10)
library(tidyverse)

filtered_mpg <- filter(mpg, cty > 21)
selected_mpg <- select(filtered_mpg, cty, hwy)
ggplot(selected_mpg, aes(cty, hwy)) + geom_point()
ggplot(select(filter(mpg, cty > 21), cty, hwy), aes(cty, hwy)) + geom_point()
mpg %>%
  filter(cty > 21) %>%
  select(cty, hwy) %>%
  ggplot(aes(cty, hwy)) + geom_point()
iris %>% head(3)
iris %>% head(3, x = .)
ls()
ls

# > function (name, pos = -1L, envir = as.environment(pos), all.names = FALSE,
# >     pattern, sorted = TRUE)
# > {
# >     if (!missing(name)) {
# >         pos <- tryCatch(name, error = function(e) e)
# >         if (inherits(pos, "error")) {
# >             name <- substitute(name)
# >             if (!is.character(name))
# >                 name <- deparse(name)
# > etc...
tbl <- read.csv("F:\research\bio\assay.csv")
x <- pi # Set x to 3.1415926...
x < -pi # Oops! We are comparing x instead of setting it!
#> [1] FALSE
x < -pi
#> Error in eval(expr, envir, enclos): object 'x' not found
x <- 0 # Initialize x to zero
x < -pi # Oops!
#> [1] FALSE
total <- 1 + 2 + 3 + # Continued on the next line
  4 + 5
print(total)
#> [1] 15
total <- 1 + 2 + 3 # Oops! R sees a complete expression
+4 + 5 # This is a new expression; R prints its value
#> [1] 9
print(total)
#> [1] 6
v <- 1 # Assign 1 to v
v == 0 # Compare v against zero
#> [1] FALSE
v <- 0 # Assign 0 to v, overwriting previous contents
n <- 5
1:n + 1
#> [1] 2 3 4 5 6
1:(n + 1)
#> [1] 1 2 3 4 5 6
x <- rnorm(100)
n <- 5
truehist(x, n)
#> Error in truehist(x, n): could not find function "truehist"
library(MASS) # Load the MASS package into R
truehist(x, n)
Error: '\m' is an unrecognized escape in character string starting "'.\temp\m"
read_csv(`./temp/my_file.csv`)
read_csv(`.\\temp\\my_file.csv`)
getwd()
#> [1] "/Users/jal/DocumentsPersonal/R-Cookbook"
setwd("~/Documents/MyDirectory")
save.image()
history()
history(100)          # Show 100 most recent lines of history
history(Inf)          # Show entire saved history
aVeryLongRunningFunction()  # Oops! Forgot to save the result!
x <- .Last.value            # Capture the result now
search()
search()
#>  [1] ".GlobalEnv"        "package:knitr"     "package:forcats"
#>  [4] "package:stringr"   "package:dplyr"     "package:purrr"
#>  [7] "package:readr"     "package:tidyr"     "package:tibble"
#> [10] "package:ggplot2"   "package:tidyverse" "package:stats"
#> [13] "package:graphics"  "package:grDevices" "package:utils"
#> [16] "package:datasets"  "package:methods"   "Autoloads"
#> [19] "package:base"
library(packagename)
lda(x)
#> Error in lda(x): could not find function "lda"
my_model <- lda(cty ~ displ + year, data = mpg)
#> Error in lda(cty ~ displ + year, data = mpg): could not find function "lda"

library(MASS)                          # Load the MASS library into memory
#>
#> Attaching package: 'MASS'
#> The following object is masked from 'package:dplyr':
#>
#>     select
my_model <-
  lda(cty ~ displ + year, data = mpg)  # Now R can find the function
detach(package:MASS)
data(dsname, package = "pkgname")
head(pressure)
#>   temperature pressure
#> 1           0   0.0002
#> 2          20   0.0012
#> 3          40   0.0060
#> 4          60   0.0300
#> 5          80   0.0900
#> 6         100   0.2700
help(pressure)      # Bring up help page for pressure dataset
data()              # Bring up a list of datasets
data(Cars93, package = "MASS")
data(package = "pkgname")
library()
installed.packages()[1:5, c("Package", "Version")]
#>            Package      Version
#> abind      "abind"      "1.4-5"
#> ade4       "ade4"       "1.7-13"
#> adegenet   "adegenet"   "2.1.1"
#> ape        "ape"        "5.2"
#> assertthat "assertthat" "0.2.0"
install.packages("packagename")
install.packages("packagename", lib = "~/lib/R")
install_github("thomasp85/tidygraph")
choosessmirror()
R will present a list of CRAN mirrors.
options("repos")[[1]][1]
options(repos = c(CRAN = "http://cran.rstudio.com"))
source("myScript.R")
print("Hello, World!")
source("hello.R")
#> [1] "Hello, World!"
source("hello.R", echo = TRUE)
#>
#> > print("Hello, World!")
#> [1] "Hello, World!"
R CMD BATCH scriptfile outputfile
Rscript scriptfile arg1 arg2 arg3
R CMD BATCH --quiet myScript.R results.out
Rscript myScript.R arg1 arg2 arg3
argv <- commandArgs(TRUE)
Rscript --slave myScript.R arg1 arg2 arg3 >results.out
argv <- commandArgs(TRUE)
x <- as.numeric(argv[1])
y <- as.numeric(argv[2])

cat("x =", x, "\n")
cat("y =", y, "\n")
cat("x + y = ", x + y, "\n")
cat("x - y = ", x - y, "\n")
cat("x * y = ", x * y, "\n")
cat("x / y = ", x / y, "\n")
Rscript arith.R 2 3.1415
x = 2
y = 3.1415
x + y = 5.1415
x - y = -1.1415
x * y = 6.283
x / y = 0.6366385
#!/usr/bin/Rscript --slave

argv <- commandArgs(TRUE)
x <- as.numeric(argv[1])
.
. (etc.)
.
chmod +x arith.R
arith.R 2 3.1415
Sys.getenv("R_HOME")
#> [1] "/Library/Frameworks/R.framework/Resources"
> Sys.getenv("R_HOME")
[1] "/Library/Frameworks/R.framework/Resources"
> Sys.getenv("R_HOME")
[1] "/usr/lib/R"
R RHOME
# /usr/lib/R
Sys.setenv(DB_USERID = "my_id")
Sys.setenv(DB_PASSWORD = "My_Password!")
options(prompt = "R> ")
source("~/.Rprofile")
#
# ... remainder of local .Rprofile...
#
help(options)
scores <- c(61, 66, 90, 88, 100)
points <- data.frame(
  label = c("Low", "Mid", "High"),
  lbound = c(0, 0.67,   1.64),
  ubound = c(0.67, 1.64,   2.33)
)
pi
#> [1] 3.14
100 * pi
#> [1] 314
print(pi, digits = 4)
#> [1] 3.142
print(100 * pi, digits = 4)
#> [1] 314.2
cat(pi, "\n")
#> 3.14
cat(format(pi, digits = 4), "\n")
#> 3.142
pnorm(-3:3)
#> [1] 0.00135 0.02275 0.15866 0.50000 0.84134 0.97725 0.99865
print(pnorm(-3:3), digits = 3)
#> [1] 0.00135 0.02275 0.15866 0.50000 0.84134 0.97725 0.99865
q <- seq(from = 0, to = 3, by = 0.5)
tbl <- data.frame(Quant = q,
                  Lower = pnorm(-q),
                  Upper = pnorm(q))
tbl                                # Unformatted print
#>   Quant   Lower Upper
#> 1   0.0 0.50000 0.500
#> 2   0.5 0.30854 0.691
#> 3   1.0 0.15866 0.841
#> 4   1.5 0.06681 0.933
#> 5   2.0 0.02275 0.977
#> 6   2.5 0.00621 0.994
#> 7   3.0 0.00135 0.999
print(tbl, digits = 2)             # Formatted print: fewer digits
#>   Quant  Lower Upper
#> 1   0.0 0.5000  0.50
#> 2   0.5 0.3085  0.69
#> 3   1.0 0.1587  0.84
#> 4   1.5 0.0668  0.93
#> 5   2.0 0.0228  0.98
#> 6   2.5 0.0062  0.99
#> 7   3.0 0.0013  1.00
pi
#> [1] 3.14
options(digits = 15)
pi
#> [1] 3.14159265358979
cat("The answer is", answer, "\n", file = "filename.txt")
sink("filename")          # Begin writing output to file

# ... other session work ...

sink()                    # Resume writing output to console
sink("script_output.txt")   # Redirect output to file
source("script.R")          # Run the script, capturing its output
sink()                      # Resume writing output to console
cat(data, file = "analysisReport.out")
cat(results, file = "analysisRepart.out", append = TRUE)
cat(conclusion, file = "analysisReport.out", append = TRUE)
con <- file("analysisReport.out", "w")
cat(data, file = con)
cat(results, file = con)
cat(conclusion, file = con)
close(con)
list.files()
#>  [1] "_book"                            "_bookdown_files"
#>  [3] "_bookdown_files.old"              "_bookdown.yml"
#>  [5] "_common.R"                        "_main.rds"
#>  [7] "_output.yaml"                     "01_GettingStarted_cache"
#>  [9] "01_GettingStarted.md"             "01_GettingStarted.Rmd"
etc ...
list.files(path = 'data/') # show files in a directory
#>  [1] "ac.rdata"               "adf.rdata"
#>  [3] "anova.rdata"            "anova2.rdata"
#>  [5] "bad.rdata"              "batches.rdata"
#>  [7] "bnd_cmty.Rdata"         "compositePerf-2010.csv"
#>  [9] "conf.rdata"             "daily.prod.rdata"
#> [11] "data1.csv"              "data2.csv"
#> [13] "datafile_missing.tsv"   "datafile.csv"
#> [15] "datafile.fwf"           "datafile.qsv"
#> [17] "datafile.ssv"           "datafile.tsv"
#> [19] "df_decay.rdata"         "df_squared.rdata"
#> [21] "diffs.rdata"            "example1_headless.csv"
#> [23] "example1.csv"           "excel_table_data.xlsx"
#> [25] "get_USDA_NASS_data.R"   "ibm.rdata"
#> [27] "iris_excel.xlsx"        "lab_df.rdata"
#> [29] "movies.sas7bdat"        "nacho_data.csv"
#> [31] "NearestPoint.R"         "not_a_csv.txt"
#> [33] "opt.rdata"              "outcome.rdata"
#> [35] "pca.rdata"              "pred.rdata"
#> [37] "pred2.rdata"            "sat.rdata"
#> [39] "singles.txt"            "state_corn_yield.rds"
#> [41] "student_data.rdata"     "suburbs.txt"
#> [43] "tab1.csv"               "tls.rdata"
#> [45] "triples.txt"            "ts_acf.rdata"
#> [47] "workers.rdata"          "world_series.csv"
#> [49] "xy.rdata"               "yield.Rdata"
#> [51] "z.RData"
list.files(path = 'data/', pattern = '\\.csv')
#> [1] "compositePerf-2010.csv" "data1.csv"
#> [3] "data2.csv"              "datafile.csv"
#> [5] "example1_headless.csv"  "example1.csv"
#> [7] "nacho_data.csv"         "tab1.csv"
#> [9] "world_series.csv"
list.files(recursive = T)
list.files(path = 'data/', all.files = TRUE)
#>  [1] "."                      ".."
#>  [3] ".DS_Store"              ".hidden_file.txt"
#>  [5] "ac.rdata"               "adf.rdata"
#>  [7] "anova.rdata"            "anova2.rdata"
#>  [9] "bad.rdata"              "batches.rdata"
#> [11] "bnd_cmty.Rdata"         "compositePerf-2010.csv"
#> [13] "conf.rdata"             "daily.prod.rdata"
#> [15] "data1.csv"              "data2.csv"
#> [17] "datafile_missing.tsv"   "datafile.csv"
#> [19] "datafile.fwf"           "datafile.qsv"
#> [21] "datafile.ssv"           "datafile.tsv"
#> [23] "df_decay.rdata"         "df_squared.rdata"
#> [25] "diffs.rdata"            "example1_headless.csv"
#> [27] "example1.csv"           "excel_table_data.xlsx"
#> [29] "get_USDA_NASS_data.R"   "ibm.rdata"
#> [31] "iris_excel.xlsx"        "lab_df.rdata"
#> [33] "movies.sas7bdat"        "nacho_data.csv"
#> [35] "NearestPoint.R"         "not_a_csv.txt"
#> [37] "opt.rdata"              "outcome.rdata"
#> [39] "pca.rdata"              "pred.rdata"
#> [41] "pred2.rdata"            "sat.rdata"
#> [43] "singles.txt"            "state_corn_yield.rds"
#> [45] "student_data.rdata"     "suburbs.txt"
#> [47] "tab1.csv"               "tls.rdata"
#> [49] "triples.txt"            "ts_acf.rdata"
#> [51] "workers.rdata"          "world_series.csv"
#> [53] "xy.rdata"               "yield.Rdata"
#> [55] "z.RData"
samp <- read_csv("C:\Data\sample-data.csv")
#> Error: '\D' is an unrecognized escape in character string starting ""C:\D"
samp <- read_csv("C:/Data/sample-data.csv")
samp <- read_csv("C:\\Data\\sample-data.csv")
library(tidyverse)
records <- read_fwf("./data/datafile.fwf",
                    fwf_cols(
                      last = 10,
                      first = 10,
                      birth = 5,
                      death = 5
                    ))
#> Parsed with column specification:
#> cols(
#>   last = col_character(),
#>   first = col_character(),
#>   birth = col_double(),
#>   death = col_double()
#> )
records
#> # A tibble: 5 x 4
#>   last    first    birth death
#>   <chr>   <chr>    <dbl> <dbl>
#> 1 Fisher  R.A.      1890  1962
#> 2 Pearson Karl      1857  1936
#> 3 Cox     Gertrude  1900  1978
#> 4 Yates   Frank     1902  1994
#> 5 Smith   Kirstine  1878  1939
Fisher    R.A.      1890 1962
Pearson   Karl      1857 1936
Cox       Gertrude  1900 1978
Yates     Frank     1902 1994
Smith     Kirstine  1878 1939
file <- "./data/datafile.fwf"
t1 <- read_fwf(file, fwf_empty(file, col_names = c("last", "first", "birth", "death")))
#> Parsed with column specification:
#> cols(
#>   last = col_character(),
#>   first = col_character(),
#>   birth = col_double(),
#>   death = col_double()
#> )
t2 <- read_fwf(file, fwf_widths(c(10, 10, 5, 4),
                                c("last", "first", "birth", "death")))
#> Parsed with column specification:
#> cols(
#>   last = col_character(),
#>   first = col_character(),
#>   birth = col_double(),
#>   death = col_double()
#> )
t3 <-
  read_fwf("./data/datafile.fwf",
           fwf_cols(
             last = 10,
             first = 10,
             birth = 5,
             death = 5
           ))
#> Parsed with column specification:
#> cols(
#>   last = col_character(),
#>   first = col_character(),
#>   birth = col_double(),
#>   death = col_double()
#> )
t4 <- read_fwf(file, fwf_cols(
  last = c(1, 10),
  first = c(11, 20),
  birth = c(21, 25),
  death = c(26, 30)
))
#> Parsed with column specification:
#> cols(
#>   last = col_character(),
#>   first = col_character(),
#>   birth = col_double(),
#>   death = col_double()
#> )
t5 <- read_fwf(file, fwf_positions(
  c(1, 11, 21, 26),
  c(10, 20, 25, 30),
  c("first", "last", "birth", "death")
))
#> Parsed with column specification:
#> cols(
#>   first = col_character(),
#>   last = col_character(),
#>   birth = col_double(),
#>   death = col_double()
#> )
library(tidyverse)

tab1 <- read_table2("./data/datafile.tsv")
#> Parsed with column specification:
#> cols(
#>   last = col_character(),
#>   first = col_character(),
#>   birth = col_double(),
#>   death = col_double()
#> )
tab1
#> # A tibble: 5 x 4
#>   last    first    birth death
#>   <chr>   <chr>    <dbl> <dbl>
#> 1 Fisher  R.A.      1890  1962
#> 2 Pearson Karl      1857  1936
#> 3 Cox     Gertrude  1900  1978
#> 4 Yates   Frank     1902  1994
#> 5 Smith   Kirstine  1878  1939
last    first   birth   death
Fisher  R.A.    1890    1962
Pearson Karl    1857    1936
Cox Gertrude    1900    1978
Yates   Frank   1902    1994
Smith   Kirstine    1878    1939
t <- read_table2("./data/datafile.ssv")
#> Parsed with column specification:
#> cols(
#>   `#The` = col_character(),
#>   following = col_character(),
#>   is = col_character(),
#>   a = col_character(),
#>   list = col_character(),
#>   of = col_character(),
#>   statisticians = col_character()
#> )
#> Warning: 6 parsing failures.
#> row col  expected    actual                  file
#>   1  -- 7 columns 4 columns './data/datafile.ssv'
#>   2  -- 7 columns 4 columns './data/datafile.ssv'
#>   3  -- 7 columns 4 columns './data/datafile.ssv'
#>   4  -- 7 columns 4 columns './data/datafile.ssv'
#>   5  -- 7 columns 4 columns './data/datafile.ssv'
#> ... ... ......... ......... .....................
#> See problems(...) for more details.
print(t)
#> # A tibble: 6 x 7
#>   `#The`  following is    a     list  of    statisticians
#>   <chr>   <chr>     <chr> <chr> <chr> <chr> <chr>
#> 1 last    first     birth death <NA>  <NA>  <NA>
#> 2 Fisher  R.A.      1890  1962  <NA>  <NA>  <NA>
#> 3 Pearson Karl      1857  1936  <NA>  <NA>  <NA>
#> 4 Cox     Gertrude  1900  1978  <NA>  <NA>  <NA>
#> 5 Yates   Frank     1902  1994  <NA>  <NA>  <NA>
#> 6 Smith   Kirstine  1878  1939  <NA>  <NA>  <NA>
t <-
  read_table2(
    "./data/datafile.tsv",
    col_types = c(
      col_character(),
      col_character(),
      col_integer(),
      col_integer()
    )
  )
last    first   birth   death
Fisher  R.A.    1890    1962
Pearson Karl    1857    1936
Cox Gertrude    1900    1978
Yates   Frank   1902    1994
Smith   Kirstine    1878    1939
Cox David 1924 .
t <- read_table2("./data/datafile_missing.tsv", na = ".")
#> Parsed with column specification:
#> cols(
#>   last = col_character(),
#>   first = col_character(),
#>   birth = col_double(),
#>   death = col_double()
#> )
t
#> # A tibble: 6 x 4
#>   last    first    birth death
#>   <chr>   <chr>    <dbl> <dbl>
#> 1 Fisher  R.A.      1890  1962
#> 2 Pearson Karl      1857  1936
#> 3 Cox     Gertrude  1900  1978
#> 4 Yates   Frank     1902  1994
#> 5 Smith   Kirstine  1878  1939
#> 6 Cox     David     1924    NA
# The following is a list of statisticians
last first birth death
Fisher R.A. 1890 1962
Pearson Karl 1857 1936
Cox Gertrude 1900 1978
Yates Frank 1902 1994
Smith Kirstine 1878 1939
t <- read_table2("./data/datafile.ssv", comment = '#')
#> Parsed with column specification:
#> cols(
#>   last = col_character(),
#>   first = col_character(),
#>   birth = col_double(),
#>   death = col_double()
#> )
t
#> # A tibble: 5 x 4
#>   last    first    birth death
#>   <chr>   <chr>    <dbl> <dbl>
#> 1 Fisher  R.A.      1890  1962
#> 2 Pearson Karl      1857  1936
#> 3 Cox     Gertrude  1900  1978
#> 4 Yates   Frank     1902  1994
#> 5 Smith   Kirstine  1878  1939
library(tidyverse)

tbl <- read_csv("./data/datafile.csv")
#> Parsed with column specification:
#> cols(
#>   last = col_character(),
#>   first = col_character(),
#>   birth = col_double(),
#>   death = col_double()
#> )
tbl <- read_csv("./data/datafile.csv",  col_names = FALSE)
#> Parsed with column specification:
#> cols(
#>   X1 = col_character(),
#>   X2 = col_character(),
#>   X3 = col_character(),
#>   X4 = col_character()
#> )
label,lbound,ubound
low,0,0.674
mid,0.674,1.64
high,1.64,2.33
tbl <- read_csv("./data/example1.csv")
#> Parsed with column specification:
#> cols(
#>   label = col_character(),
#>   lbound = col_double(),
#>   ubound = col_double()
#> )
tbl
#> # A tibble: 3 x 3
#>   label lbound ubound
#>   <chr>  <dbl>  <dbl>
#> 1 low    0      0.674
#> 2 mid    0.674  1.64
#> 3 high   1.64   2.33
tbl <- read_csv("./data/example1.csv", col_names = FALSE)
#> Parsed with column specification:
#> cols(
#>   X1 = col_character(),
#>   X2 = col_character(),
#>   X3 = col_character()
#> )
tbl
#> # A tibble: 4 x 3
#>   X1    X2     X3
#>   <chr> <chr>  <chr>
#> 1 label lbound ubound
#> 2 low   0      0.674
#> 3 mid   0.674  1.64
#> 4 high  1.64   2.33
library(tidyverse)

write_csv(tab1, path = "./data/tab1.csv")
library(tidyverse)

print(tab1)
#> # A tibble: 5 x 4
#>   last    first    birth death
#>   <chr>   <chr>    <dbl> <dbl>
#> 1 Fisher  R.A.      1890  1962
#> 2 Pearson Karl      1857  1936
#> 3 Cox     Gertrude  1900  1978
#> 4 Yates   Frank     1902  1994
#> 5 Smith   Kirstine  1878  1939
write_csv(tab1, "./data/tab1.csv")
last,first,birth,death
Fisher,R.A.,1890,1962
Pearson,Karl,1857,1936
Cox,Gertrude,1900,1978
Yates,Frank,1902,1994
Smith,Kirstine,1878,1939
library(tidyverse)

berkley <- read_csv('http://bit.ly/barkley18', comment = '#')
#> Parsed with column specification:
#> cols(
#>   Name = col_character(),
#>   Location = col_character(),
#>   Time = col_time(format = "")
#> )
tbl <- read_table2("ftp://ftp.example.com/download/data.txt")
library(openxlsx)

df1 <- read.xlsx(xlsxFile = "data/iris_excel.xlsx",
                 sheet = 'iris_data')
head(df1, 3)
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1          5.1         3.5          1.4         0.2  setosa
#> 2          4.9         3.0          1.4         0.2  setosa
#> 3          4.7         3.2          1.3         0.2  setosa
library(openxlsx)
wb <- loadWorkbook("data/excel_table_data.xlsx")
tables <- getTables(wb, 'input_data')
table_range_str <- names(tables[tables == 'example_table'])
table_range_refs <- strsplit(table_range_str, ':')[[1]]

# use a regex to extract out the row numbers
table_range_row_num <- gsub("[^0-9.]", "", table_range_refs)

# extract out the column numbers
table_range_col_num <- convertFromExcelRef(table_range_refs)
df <- read.xlsx(
  xlsxFile = "data/excel_table_data.xlsx",
  sheet = 'input_data',
  cols = table_range_col_num[1]:table_range_col_num[2],
  rows = table_range_row_num[1]:table_range_row_num[2]
)
library(openxlsx)

write.xlsx(x = iris,
           sheetName = 'iris_data',
           file = "data/iris_excel.xlsx")
library(openxlsx)

wb <- loadWorkbook("data/excel_table_data.xlsx")
tables <- getTables(wb, 'input_data')
table_range_str <- names(tables[tables == 'example_table'])
table_range_refs <- strsplit(table_range_str, ':')[[1]]

# use a regex to extract out the starting row number
table_row_num <- gsub("[^0-9.]", "", table_range_refs)[[1]]

# extract out the starting column number
table_col_num <- convertFromExcelRef(table_range_refs)[[1]]
## remove the existing Excel Table
removeTable(wb = wb,
            sheet = 'input_data',
            table = 'example_table')
writeDataTable(
  wb = wb,
  sheet = 'input_data',
  x = iris,
  startCol = table_col_num,
  startRow = table_row_num,
  tableStyle = "TableStyleLight9",
  tableName = 'example_table'
)
writeData(
  wb = wb,
  sheet = 'input_data',
  x = paste('example_table data refreshed on:', Sys.time()),
  startCol = 2,
  startRow = 5
)

## then save the workbook
saveWorkbook(wb = wb,
             file = "data/excel_table_data.xlsx",
             overwrite = T)
library(haven)

sas_movie_data <- read_sas("data/movies.sas7bdat")
sapply(sas_movie_data, attributes)
#> $Movie
#> $Movie$label
#> [1] "Movie"
#>
#>
#> $Type
#> $Type$label
#> [1] "Type"
#>
#>
#> $Rating
#> $Rating$label
#> [1] "Rating"
#>
#>
#> $Year
#> $Year$label
#> [1] "Year"
#>
#>
#> $Domestic__
#> $Domestic__$label
#> [1] "Domestic $"
#>
#> $Domestic__$format.sas
#> [1] "F"
#>
#>
#> $Worldwide__
#> $Worldwide__$label
#> [1] "Worldwide $"
#>
#> $Worldwide__$format.sas
#> [1] "F"
#>
#>
#> $Director
#> $Director$label
#> [1] "Director"
library(rvest)
library(magrittr)

all_tables <-
  read_html("https://en.wikipedia.org/wiki/Aviation_accidents_and_incidents") %>%
  html_table(fill = TRUE, header = TRUE)
out_table <-
  read_html("https://en.wikipedia.org/wiki/Aviation_accidents_and_incidents") %>%
  html_table(fill = TRUE, header = TRUE) %>%
  extract2(2)

head(out_table)
#>   Year Deaths[52] # of incidents[53]
#> 1 2017        399           101 [54]
#> 2 2016        629                102
#> 3 2015        898                123
#> 4 2014      1,328                122
#> 5 2013        459                138
#> 6 2012        800                156
url <- 'http://en.wikipedia.org/wiki/World_population'
tbls <- read_html(url) %>%
  html_table(fill = TRUE, header = TRUE)
length(tbls)
#> [1] 23
library(magrittr)
url <- 'http://en.wikipedia.org/wiki/World_population'
tbl <- read_html(url) %>%
  html_table(fill = TRUE, header = TRUE) %>%
  extract2(2)

head(tbl, 2)
#>   World population (millions, UN estimates)[10]
#> 1                                             #
#> 2                                             1
#>   World population (millions, UN estimates)[10]
#> 1               Top ten most populous countries
#> 2                                        China*
#>   World population (millions, UN estimates)[10]
#> 1                                          2000
#> 2                                         1,270
#>   World population (millions, UN estimates)[10]
#> 1                                          2015
#> 2                                         1,376
#>   World population (millions, UN estimates)[10]
#> 1                                         2030*
#> 2                                         1,416
tbl[, c(2, 3)]
#>                          World population (millions, UN estimates)[10]
#> 1                                      Top ten most populous countries
#> 2                                                               China*
#> 3                                                                India
#> 4                                                        United States
#> 5                                                            Indonesia
#> 6                                                             Pakistan
#> 7                                                               Brazil
#> 8                                                              Nigeria
#> 9                                                           Bangladesh
#> 10                                                              Russia
#> 11                                                              Mexico
#> 12                                                         World total
#> 13 Notes:\nChina = excludes Hong Kong and Macau\n2030 = Medium variant
#>                        World population (millions, UN estimates)[10].1
#> 1                                                                 2000
#> 2                                                                1,270
#> 3                                                                1,053
#> 4                                                                  283
#> 5                                                                  212
#> 6                                                                  136
#> 7                                                                  176
#> 8                                                                  123
#> 9                                                                  131
#> 10                                                                 146
#> 11                                                                 103
#> 12                                                               6,127
#> 13 Notes:\nChina = excludes Hong Kong and Macau\n2030 = Medium variant
lines <- readLines("input.txt")
lines <- readLines("input.txt", n = 10)       # Read 10 lines and stop
2355.09 2246.73 1738.74 1841.01 2027.85
singles <- scan("./data/singles.txt", what = numeric(0))
singles
#> [1] 2355.09 2246.73 1738.74 1841.01 2027.85
15-Oct-87 2439.78 2345.63 16-Oct-87 2396.21 2207.73
19-Oct-87 2164.16 1677.55 20-Oct-87 2067.47 1616.21
21-Oct-87 2081.07 1951.76
triples <-
  scan("./data/triples.txt",
       what = list(character(0), numeric(0), numeric(0)))
triples
#> [[1]]
#> [1] "15-Oct-87" "16-Oct-87" "19-Oct-87" "20-Oct-87" "21-Oct-87"
#>
#> [[2]]
#> [1] 2439.78 2396.21 2164.16 2067.47 2081.07
#>
#> [[3]]
#> [1] 2345.63 2207.73 1677.55 1616.21 1951.76
triples <- scan("./data/triples.txt",
                what = list(
                  date = character(0),
                  high = numeric(0),
                  low = numeric(0)
                ))
triples
#> $date
#> [1] "15-Oct-87" "16-Oct-87" "19-Oct-87" "20-Oct-87" "21-Oct-87"
#>
#> $high
#> [1] 2439.78 2396.21 2164.16 2067.47 2081.07
#>
#> $low
#> [1] 2345.63 2207.73 1677.55 1616.21 1951.76
df_triples <- data.frame(triples)
df_triples
#>        date    high     low
#> 1 15-Oct-87 2439.78 2345.63
#> 2 16-Oct-87 2396.21 2207.73
#> 3 19-Oct-87 2164.16 1677.55
#> 4 20-Oct-87 2067.47 1616.21
#> 5 21-Oct-87 2081.07 1951.76
1903  LWLlwwwW    1927  wwWW      1950  wwWW      1973  WLwllWW
1905  wLwWW       1928  WWww      1951  LWlwwW    1974  wlWWW
1906  wLwLwW      1929  wwLWW     1952  lwLWLww   1975  lwWLWlw
1907  WWww        1930  WWllwW    1953  WWllwW    1976  WWww
1908  wWLww       1931  LWwlwLW   1954  WWww      1977  WLwwlW

.
. (etc.)
.
# Read the wseries dataset:
#     - Skip the first 35 lines
#     - Then read 23 lines of data
#     - The data occurs in pairs: a year and a pattern (char string)
#
world.series <- scan(
  "http://lib.stat.cmu.edu/datasets/wseries",
  skip = 35,
  nlines = 23,
  what = list(year = integer(0),
              pattern = character(0)),
)
world.series$year
#>  [1] 1903 1927 1950 1973 1905 1928 1951 1974 1906 1929 1952 1975 1907 1930
#> [15] 1953 1976 1908 1931 1954 1977 1909 1932 1955 1978 1910 1933 1956 1979
#> [29] 1911 1934 1957 1980 1912 1935 1958 1981 1913 1936 1959 1982 1914 1937
#> [43] 1960 1983 1915 1938 1961 1984 1916 1939 1962 1985 1917 1940 1963 1986
#> [57] 1918 1941 1964 1987 1919 1942 1965 1988 1920 1943 1966 1989 1921 1944
#> [71] 1967 1990 1922 1945 1968 1991 1923 1946 1969 1992 1924 1947 1970 1993
#> [85] 1925 1948 1971 1926 1949 1972
perm <- order(world.series$year)
world.series <- list(year    = world.series$year[perm],
                     pattern = world.series$pattern[perm])
world.series$year
#>  [1] 1903 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917
#> [15] 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931
#> [29] 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945
#> [43] 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959
#> [57] 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973
#> [71] 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987
#> [85] 1988 1989 1990 1991 1992 1993

world.series$pattern
#>  [1] "LWLlwwwW" "wLwWW"    "wLwLwW"   "WWww"     "wWLww"    "WLwlWlw"
#>  [7] "WWwlw"    "lWwWlW"   "wLwWlLW"  "wLwWw"    "wwWW"     "lwWWw"
#> [13] "WWlwW"    "WWllWw"   "wlwWLW"   "WWlwwLLw" "wllWWWW"  "LlWwLwWw"
#> [19] "WWwW"     "LwLwWw"   "LWlwlWW"  "LWllwWW"  "lwWLLww"  "wwWW"
#> [25] "WWww"     "wwLWW"    "WWllwW"   "LWwlwLW"  "WWww"     "WWlww"
#> [31] "wlWLLww"  "LWwwlW"   "lwWWLw"   "WWwlw"    "wwWW"     "WWww"
#> [37] "LWlwlWW"  "WLwww"    "LWwww"    "WLWww"    "LWlwwW"   "LWLwwlw"
#> [43] "LWlwlww"  "WWllwLW"  "lwWWLw"   "WLwww"    "wwWW"     "LWlwwW"
#> [49] "lwLWLww"  "WWllwW"   "WWww"     "llWWWlw"  "llWWWlw"  "lwLWWlw"
#> [55] "llWLWww"  "lwWWLw"   "WLlwwLW"  "WLwww"    "wlWLWlw"  "wwWW"
#> [61] "WLlwwLW"  "llWWWlw"  "wwWW"     "wlWWLlw"  "lwLLWww"  "lwWWW"
#> [67] "wwWLW"    "llWWWlw"  "wwLWLlw"  "WLwllWW"  "wlWWW"    "lwWLWlw"
#> [73] "WWww"     "WLwwlW"   "llWWWw"   "lwLLWww"  "WWllwW"   "llWWWw"
#> [79] "LWwllWW"  "LWwww"    "wlWWW"    "LLwlwWW"  "LLwwlWW"  "WWlllWW"
#> [85] "WWlww"    "WWww"     "WWww"     "WWlllWW"  "lwWWLw"   "WLwwlW"
library(RMySQL)

con <- dbConnect(
    drv = RMySQL::MySQL(),
    dbname = "your_db_name",
    host = "your.host.com",
    username = "userid",
    password = "pwd"
  )
[client]
user = userid
password = password
host = hostname
con <- dbConnect(dbConnect(
  drv = RMySQL::MySQL(),
  dbname = "your_db_name",
  host = "your.host.com"
)
sql <- "SELECT * from SurveyResults WHERE City = 'Chicago'"
rows <- dbGetQuery(con, sql)
if (dbMoreResults(con)) dbNextResult(con)
dbDisconnect(con)
con <- dbConnect(MySQL(), client.flag = CLIENT_MULTI_RESULTS)
sql <- paste(
  "select * from DailyBar where Symbol = 'IBM'",
  "and Day between '2008-12-29' and '2008-12-31'"
)
rows <- dbGetQuery(con, sql)
if (dbMoreResults(con)) {
  dbNextResults(con)
}
dbDisconnect(con)
print(rows)
con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
sleep_db <- copy_to(con, msleep, "sleep")
sleep_table <- tbl(con, "sleep")
little_sleep <- sleep_table %>%
  select(name, genus, order, sleep_total) %>%
  filter(sleep_total < 3)
show_query(little_sleep)
#> <SQL>
#> SELECT *
#> FROM (SELECT `name`, `genus`, `order`, `sleep_total`
#> FROM `sleep`)
#> WHERE (`sleep_total` < 3.0)
local_little_sleep <- collect(little_sleep)
local_little_sleep
#> # A tibble: 3 x 4
#>   name        genus         order          sleep_total
#>   <chr>       <chr>         <chr>                <dbl>
#> 1 Horse       Equus         Perissodactyla         2.9
#> 2 Giraffe     Giraffa       Artiodactyla           1.9
#> 3 Pilot whale Globicephalus Cetacea                2.7
save(tbl, t, file = "myData.RData")
load("myData.RData")
dput(tbl, file = "myData.txt")
dump("tbl", file = "myData.txt")    # Note quotes around variable name
myData <- load("myData.RData")     # Achtung! Might not do what you think
myData
#> [1] "tbl" "t"
str(myData)
#>  chr [1:2] "tbl" "t"
load("./data/z.RData")   # Create and populate the z variable
plot(z)                  # Does not plot as expected: zoo pkg not loaded
library(zoo)                  # Load the zoo package into memory
load("./data/z.RData") # Create and populate the z variable
plot(z)                       # Ahhh. Now plotting works correctly
v <- c(10, 20, 30)
names(v) <- c("Moe", "Larry", "Curly")
print(v)
#>   Moe Larry Curly
#>    10    20    30
v["Larry"]
#> Larry
#>    20
mode(3.1415)                        # Mode of a number
#> [1] "numeric"
mode(c(2.7182, 3.1415))             # Mode of a vector of numbers
#> [1] "numeric"
mode("Moe")                         # Mode of a character string
#> [1] "character"
mode(list("Moe", "Larry", "Curly")) # Mode of a list
#> [1] "list"
d <- as.Date("2010-03-15")
mode(d)
#> [1] "numeric"
length(d)
#> [1] 1
class(d)
#> [1] "Date"
pi
#> [1] 3.14
length(pi)
#> [1] 1
pi[1]
#> [1] 3.14
pi[2]
#> [1] NA
A <- 1:6
dim(A)
#> NULL
print(A)
#> [1] 1 2 3 4 5 6
dim(A) <- c(2, 3)
print(A)
#>      [,1] [,2] [,3]
#> [1,]    1    3    5
#> [2,]    2    4    6
B <- list(1, 2, 3, 4, 5, 6)
dim(B)
#> NULL
dim(B) <- c(2, 3)
print(B)
#>      [,1] [,2] [,3]
#> [1,] 1    3    5
#> [2,] 2    4    6
D <- 1:12
dim(D) <- c(2, 3, 2)
print(D)
#> , , 1
#>
#>      [,1] [,2] [,3]
#> [1,]    1    3    5
#> [2,]    2    4    6
#>
#> , , 2
#>
#>      [,1] [,2] [,3]
#> [1,]    7    9   11
#> [2,]    8   10   12
C <- list(1, 2, 3, "X", "Y", "Z")
dim(C) <- c(2, 3)
print(C)
#>      [,1] [,2] [,3]
#> [1,] 1    3    "Y"
#> [2,] 2    "X"  "Z"
v <- c(1, 2, 3)
newItems <- c(6, 7, 8)
v <- c(v, newItems)
v
#> [1] 1 2 3 6 7 8
v[length(v) + 1] <- 42
v
#> [1]  1  2  3  6  7  8 42
v <- c(1, 2, 3)
v <- c(v, 4) # Append a single value to v
v
#> [1] 1 2 3 4

w <- c(5, 6, 7, 8)
v <- c(v, w) # Append an entire vector to v
v
#> [1] 1 2 3 4 5 6 7 8
v <- c(1, 2, 3) # Create a vector of three elements
v[10] <- 10 # Assign to the 10th element
v # R extends the vector automatically
#>  [1]  1  2  3 NA NA NA NA NA NA 10
v
#>  [1]  1  2  3 NA NA NA NA NA NA 10
newvalues <- c(100, 101)
n <- 2
append(v, newvalues, after = n)
#>  [1]   1   2 100 101   3  NA  NA  NA  NA  NA  NA  10
append(1:10, 99, after = 5)
#>  [1]  1  2  3  4  5 99  6  7  8  9 10
append(1:10, 99, after = 0)
#>  [1] 99  1  2  3  4  5  6  7  8  9 10
   1:6   1:3
  ----- -----
    1     1
    2     2
    3     3
    4
    5
    6
   1:6   1:3   (1:6) + (1:3)
  ----- ----- ---------------
    1     1         2
    2     2         4
    3     3         6
    4               5
    5               7
    6               9
(1:6) + (1:3)
#> [1] 2 4 6 5 7 9
r}
cbind(1:6)

cbind(1:3)
cbind(1:6, 1:3)
#>      [,1] [,2]
#> [1,]    1    1
#> [2,]    2    2
#> [3,]    3    3
#> [4,]    4    1
#> [5,]    5    2
#> [6,]    6    3
(1:6) + (1:5) # Oops! 1:5 is one element too short
#> Warning in (1:6) + (1:5): longer object length is not a multiple of shorter
#> object length
#> [1]  2  4  6  8 10  7
(1:6) + 10
#> [1] 11 12 13 14 15 16
v <- c("dog", "cat", "mouse", "rat", "dog")
f <- factor(v) # v can be a vector of strings or integers
f
#> [1] dog   cat   mouse rat   dog
#> Levels: cat dog mouse rat
str(f)
#>  Factor w/ 4 levels "cat","dog","mouse",..: 2 1 3 4 2
v <- c("dog", "cat", "mouse", "rat", "dog")
f <- factor(v, levels = c("dog", "cat", "mouse", "rat", "horse"))
f
#> [1] dog   cat   mouse rat   dog
#> Levels: dog cat mouse rat horse
str(f)
#>  Factor w/ 5 levels "dog","cat","mouse",..: 1 2 3 4 1
f <- factor(c("Win", "Win", "Lose", "Tie", "Win", "Lose"))
f
#> [1] Win  Win  Lose Tie  Win  Lose
#> Levels: Lose Tie Win
wday <- c("Wed", "Thu", "Mon", "Wed", "Thu", "Thu", "Thu", "Tue", "Thu", "Tue")
f <- factor(wday)
f
#>  [1] Wed Thu Mon Wed Thu Thu Thu Tue Thu Tue
#> Levels: Mon Thu Tue Wed
f <- factor(wday, c("Mon", "Tue", "Wed", "Thu", "Fri"))
f
#>  [1] Wed Thu Mon Wed Thu Thu Thu Tue Thu Tue
#> Levels: Mon Tue Wed Thu Fri
v1 <- c(1, 2, 3)
v2 <- c(4, 5, 6)
v3 <- c(7, 8, 9)
comb <- stack(list(v1 = v1, v2 = v2, v3 = v3)) # Combine 3 vectors
comb
#>   values ind
#> 1      1  v1
#> 2      2  v1
#> 3      3  v1
#> 4      4  v2
#> 5      5  v2
#> 6      6  v2
#> 7      7  v3
#> 8      8  v3
#> 9      9  v3
set.seed(2)
n <- 5
freshmen <- sample(1:5, n, replace = TRUE, prob = c(.6, .2, .1, .05, .05))
sophomores <- sample(1:5, n, replace = TRUE, prob = c(.05, .2, .6, .1, .05))
juniors <- sample(1:5, n, replace = TRUE, prob = c(.05, .2, .55, .15, .05))

comb <- stack(list(fresh = freshmen, soph = sophomores, jrs = juniors))
print(comb)
#>    values   ind
#> 1       1 fresh
#> 2       2 fresh
#> 3       1 fresh
#> 4       1 fresh
#> 5       5 fresh
#> 6       5  soph
#> 7       3  soph
#> 8       4  soph
#> 9       3  soph
#> 10      3  soph
#> 11      2   jrs
#> 12      3   jrs
#> 13      4   jrs
#> 14      3   jrs
#> 15      3   jrs
aov(values ~ ind, data = comb)
#> Call:
#>    aov(formula = values ~ ind, data = comb)
#>
#> Terms:
#>                   ind Residuals
#> Sum of Squares   6.53     17.20
#> Deg. of Freedom     2        12
#>
#> Residual standard error: 1.2
#> Estimated effects may be unbalanced
x <- c("a", "b", "c")
y <- c(1, 2, 3)
z <- "why be normal?"
lst <- list(x, y, z)
lst
#> [[1]]
#> [1] "a" "b" "c"
#>
#> [[2]]
#> [1] 1 2 3
#>
#> [[3]]
#> [1] "why be normal?"
lst <- list(0.5, 0.841, 0.977)
lst
#> [[1]]
#> [1] 0.5
#>
#> [[2]]
#> [1] 0.841
#>
#> [[3]]
#> [1] 0.977
lst <- list(3.14, "Moe", c(1, 1, 2, 3), mean)
lst
#> [[1]]
#> [1] 3.14
#>
#> [[2]]
#> [1] "Moe"
#>
#> [[3]]
#> [1] 1 1 2 3
#>
#> [[4]]
#> function (x, ...)
#> UseMethod("mean")
#> <bytecode: 0x7f8f0457ff88>
#> <environment: namespace:base>
lst <- list()
lst[[1]] <- 3.14
lst[[2]] <- "Moe"
lst[[3]] <- c(1, 1, 2, 3)
lst[[4]] <- mean
lst
#> [[1]]
#> [1] 3.14
#>
#> [[2]]
#> [1] "Moe"
#>
#> [[3]]
#> [1] 1 1 2 3
#>
#> [[4]]
#> function (x, ...)
#> UseMethod("mean")
#> <bytecode: 0x7f8f0457ff88>
#> <environment: namespace:base>
lst <- list(mid = 0.5, right = 0.841, far.right = 0.977)
lst
#> $mid
#> [1] 0.5
#>
#> $right
#> [1] 0.841
#>
#> $far.right
#> [1] 0.977
years <- list(1960, 1964, 1976, 1994)
years
#> [[1]]
#> [1] 1960
#>
#> [[2]]
#> [1] 1964
#>
#> [[3]]
#> [1] 1976
#>
#> [[4]]
#> [1] 1994
years[[1]]
years[c(1, 2)]
#> [[1]]
#> [1] 1960
#>
#> [[2]]
#> [1] 1964
class(years[[1]])
#> [1] "numeric"

class(years[1])
#> [1] "list"
cat(years[[1]], "\n")
#> 1960

cat(years[1], "\n")
#> Error in cat(years[1], "\n"): argument 1 (type 'list') cannot be handled by 'cat'
years <- list(Kennedy = 1960, Johnson = 1964, Carter = 1976, Clinton = 1994)
years[["Kennedy"]]
#> [1] 1960
years$Kennedy
#> [1] 1960
years[c("Kennedy", "Johnson")]
#> $Kennedy
#> [1] 1960
#>
#> $Johnson
#> [1] 1964

years["Carter"]
#> $Carter
#> [1] 1976
lst <- list(mid = 0.5, right = 0.841, far.right = 0.977)
lst
#> $mid
#> [1] 0.5
#>
#> $right
#> [1] 0.841
#>
#> $far.right
#> [1] 0.977
values <- c(1, 2, 3)
names <- c("a", "b", "c")
lst <- list()
lst[names] <- values
lst
#> $a
#> [1] 1
#>
#> $b
#> [1] 2
#>
#> $c
#> [1] 3
lst <- list(
  far.left = 0.023,
  left = 0.159,
  mid = 0.500,
  right = 0.841,
  far.right = 0.977
)
lst
#> $far.left
#> [1] 0.023
#>
#> $left
#> [1] 0.159
#>
#> $mid
#> [1] 0.5
#>
#> $right
#> [1] 0.841
#>
#> $far.right
#> [1] 0.977
lst <- list()
lst$far.left <- 0.023
lst$left <- 0.159
lst$mid <- 0.500
lst$right <- 0.841
lst$far.right <- 0.977
lst
#> $far.left
#> [1] 0.023
#>
#> $left
#> [1] 0.159
#>
#> $mid
#> [1] 0.5
#>
#> $right
#> [1] 0.841
#>
#> $far.right
#> [1] 0.977
values <- pnorm(-2:2)
names <- c("far.left", "left", "mid", "right", "far.right")
lst <- list()
lst[names] <- values
cat("The left limit is", lst[["left"]], "\n")
#> The left limit is 0.159
cat("The right limit is", lst[["right"]], "\n")
#> The right limit is 0.841

for (nm in names(lst)) cat("The", nm, "limit is", lst[[nm]], "\n")
#> The far.left limit is 0.0228
#> The left limit is 0.159
#> The mid limit is 0.5
#> The right limit is 0.841
#> The far.right limit is 0.977
years <- list(Kennedy = 1960, Johnson = 1964, Carter = 1976, Clinton = 1994)
years
#> $Kennedy
#> [1] 1960
#>
#> $Johnson
#> [1] 1964
#>
#> $Carter
#> [1] 1976
#>
#> $Clinton
#> [1] 1994
years[["Johnson"]] <- NULL # Remove the element labeled "Johnson"
years
#> $Kennedy
#> [1] 1960
#>
#> $Carter
#> [1] 1976
#>
#> $Clinton
#> [1] 1994
years[c("Carter", "Clinton")] <- NULL # Remove two elements
years
#> $Kennedy
#> [1] 1960
iq.scores <- list(rnorm(5, 100, 15))
iq.scores
#> [[1]]
#> [1] 115.8  88.7  78.4  95.7  84.5
mean(iq.scores)
#> Warning in mean.default(iq.scores): argument is not numeric or logical:
#> returning NA
#> [1] NA
mean(unlist(iq.scores))
#> [1] 92.6
cat(iq.scores, "\n")
#> Error in cat(iq.scores, "\n"): argument 1 (type 'list') cannot be handled by 'cat'
cat("IQ Scores:", unlist(iq.scores), "\n")
#> IQ Scores: 116 88.7 78.4 95.7 84.5
lst <- list(1, NULL, 2, 3, NULL, 4)
lst
#> [[1]]
#> [1] 1
#>
#> [[2]]
#> NULL
#>
#> [[3]]
#> [1] 2
#>
#> [[4]]
#> [1] 3
#>
#> [[5]]
#> NULL
#>
#> [[6]]
#> [1] 4
lst[sapply(lst, is.null)] <- NULL
lst
#> [[1]]
#> [1] 1
#>
#> [[2]]
#> [1] 2
#>
#> [[3]]
#> [1] 3
#>
#> [[4]]
#> [1] 4
lst <- list("Moe", NULL, "Curly") # Create list with NULL element
lst
#> [[1]]
#> [1] "Moe"
#>
#> [[2]]
#> NULL
#>
#> [[3]]
#> [1] "Curly"

lst[sapply(lst, is.null)] <- NULL # Remove NULL element from list
lst
#> [[1]]
#> [1] "Moe"
#>
#> [[2]]
#> [1] "Curly"
lst <- as.list(rnorm(7))
lst
#> [[1]]
#> [1] -0.0281
#>
#> [[2]]
#> [1] -0.366
#>
#> [[3]]
#> [1] -1.12
#>
#> [[4]]
#> [1] -0.976
#>
#> [[5]]
#> [1] 1.12
#>
#> [[6]]
#> [1] 0.324
#>
#> [[7]]
#> [1] -0.568

lst[lst < 0] <- NULL
lst
#> [[1]]
#> [1] 1.12
#>
#> [[2]]
#> [1] 0.324
list(rnorm(7))
#> [[1]]
#> [1] -1.034 -0.533 -0.981  0.823 -0.388  0.879 -2.178
lst[lst == 0] <- NULL
lst[is.na(lst)] <- NULL
lst[abs(lst) < 1] <- NULL
#> Error in abs(lst): non-numeric argument to mathematical function
lst
#> [[1]]
#> [1] 1.12
#>
#> [[2]]
#> [1] 0.324
lst[abs(unlist(lst)) < 1] <- NULL
lst
#> [[1]]
#> [1] 1.12
lst <- as.list(rnorm(5))
lst
#> [[1]]
#> [1] 1.47
#>
#> [[2]]
#> [1] 0.885
#>
#> [[3]]
#> [1] 2.29
#>
#> [[4]]
#> [1] 0.554
#>
#> [[5]]
#> [1] 1.21
lst[lapply(lst, abs) < 1] <- NULL
lst
#> [[1]]
#> [1] 1.47
#>
#> [[2]]
#> [1] 2.29
#>
#> [[3]]
#> [1] 1.21
x <- 1:10
y1 <- 2 * x + rnorm(10, 0, 1)
y2 <- 3 * x + rnorm(10, 0, 8)

result_list <- list(lm(x ~ y1), lm(x ~ y2))

result_list[sapply(result_list, function(m) summary(m)$r.squared < 0.7)] <- NULL
sapply(result_list, function(m) summary(m)$r.squared)
#> [1] 0.990 0.708
lapply(result_list, function(m) summary(m)$r.squared)
#> [[1]]
#> [1] 0.99
#>
#> [[2]]
#> [1] 0.708
vec <- 1:6
matrix(vec, 2, 3)
#>      [,1] [,2] [,3]
#> [1,]    1    3    5
#> [2,]    2    4    6
matrix(0, 2, 3) # Create an all-zeros matrix
#>      [,1] [,2] [,3]
#> [1,]    0    0    0
#> [2,]    0    0    0

matrix(NA, 2, 3) # Create a matrix populated with NA
#>      [,1] [,2] [,3]
#> [1,]   NA   NA   NA
#> [2,]   NA   NA   NA
mat <- matrix(c(1.1, 1.2, 1.3, 2.1, 2.2, 2.3), 2, 3)
mat
#>      [,1] [,2] [,3]
#> [1,]  1.1  1.3  2.2
#> [2,]  1.2  2.1  2.3
theData <- c(
  1.1, 1.2, 1.3,
  2.1, 2.2, 2.3
)
mat <- matrix(theData, 2, 3, byrow = TRUE)
mat
#>      [,1] [,2] [,3]
#> [1,]  1.1  1.2  1.3
#> [2,]  2.1  2.2  2.3
mat <- matrix(c(
  1.1, 1.2, 1.3,
  2.1, 2.2, 2.3
),
2, 3,
byrow = TRUE
)
v <- c(1.1, 1.2, 1.3, 2.1, 2.2, 2.3)
dim(v) <- c(2, 3)
v
#>      [,1] [,2] [,3]
#> [1,]  1.1  1.3  2.2
#> [2,]  1.2  2.1  2.3
theData <- c(
  1.1, 1.2, 1.3,
  2.1, 2.2, 2.3,
  3.1, 3.2, 3.3
)
mat <- matrix(theData, 3, 3, byrow = TRUE)

rownames(mat) <- c("rowname1", "rowname2", "rowname3")
colnames(mat) <- c("colname1", "colname2", "colname3")
mat
#>          colname1 colname2 colname3
#> rowname1      1.1      1.2      1.3
#> rowname2      2.1      2.2      2.3
#> rowname3      3.1      3.2      3.3
library("quantmod")
#> Loading required package: xts
#> Loading required package: zoo
#>
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#>
#>     as.Date, as.Date.numeric
#>
#> Attaching package: 'xts'
#> The following objects are masked from 'package:dplyr':
#>
#>     first, last
#> Loading required package: TTR
#> Version 0.4-0 included new data defaults. See ?getSymbols.

getSymbols(c("AAPL", "MSFT", "GOOG"), auto.assign = TRUE)
#> 'getSymbols' currently uses auto.assign=TRUE by default, but will
#> use auto.assign=FALSE in 0.5-0. You will still be able to use
#> 'loadSymbols' to automatically load data. getOption("getSymbols.env")
#> and getOption("getSymbols.auto.assign") will still be checked for
#> alternate defaults.
#>
#> This message is shown once per session and may be disabled by setting
#> options("getSymbols.warning4.0"=FALSE). See ?getSymbols for details.
#>
#> WARNING: There have been significant changes to Yahoo Finance data.
#> Please see the Warning section of '?getSymbols.yahoo' for details.
#>
#> This message is shown once per session and may be disabled by setting
#> options("getSymbols.yahoo.warning"=FALSE).
#> [1] "AAPL" "MSFT" "GOOG"
cor_mat <- cor(cbind(
  periodReturn(AAPL, period = "daily", subset = "2017"),
  periodReturn(MSFT, period = "daily", subset = "2017"),
  periodReturn(GOOG, period = "daily", subset = "2017")
))
cor_mat
#>                 daily.returns daily.returns.1 daily.returns.2
#> daily.returns           1.000           0.438           0.489
#> daily.returns.1         0.438           1.000           0.619
#> daily.returns.2         0.489           0.619           1.000
colnames(cor_mat) <- c("AAPL", "MSFT", "GOOG")
rownames(cor_mat) <- c("AAPL", "MSFT", "GOOG")
cor_mat
#>       AAPL  MSFT  GOOG
#> AAPL 1.000 0.438 0.489
#> MSFT 0.438 1.000 0.619
#> GOOG 0.489 0.619 1.000
cor_mat["MSFT", "GOOG"] # What is the correlation between MSFT and GOOG?
#> [1] 0.619
mat[1, ] # First row
#> colname1 colname2 colname3
#>      1.1      1.2      1.3
mat[, 3] # Third column
#> rowname1 rowname2 rowname3
#>      1.3      2.3      3.3
mat[1, , drop = FALSE] # First row in a one-row matrix
#>          colname1 colname2 colname3
#> rowname1      1.1      1.2      1.3
mat[, 3, drop = FALSE] # Third column in a one-column matrix
#>          colname3
#> rowname1      1.3
#> rowname2      2.3
#> rowname3      3.3
mat[1, ]
#> colname1 colname2 colname3
#>      1.1      1.2      1.3

mat[, 3]
#> rowname1 rowname2 rowname3
#>      1.3      2.3      3.3
mat[1, , drop = FALSE]
#>          colname1 colname2 colname3
#> rowname1      1.1      1.2      1.3
mat[, 3, drop = FALSE]
#>          colname3
#> rowname1      1.3
#> rowname2      2.3
#> rowname3      3.3
v1 <- 1:5
v2 <- 6:10
v3 <- c("A", "B", "C", "D", "E")
f1 <- factor(c("a", "a", "a", "b", "b"))
df <- data.frame(v1, v2, v3, f1)
df
#>   v1 v2 v3 f1
#> 1  1  6  A  a
#> 2  2  7  B  a
#> 3  3  8  C  a
#> 4  4  9  D  b
#> 5  5 10  E  b
list.of.vectors <- list(v1 = v1, v2 = v2, v3 = v3, f1 = f1)
df2 <- as.data.frame(list.of.vectors)
df2
#>   v1 v2 v3 f1
#> 1  1  6  A  a
#> 2  2  7  B  a
#> 3  3  8  C  a
#> 4  4  9  D  b
#> 5  5 10  E  b
pred1 <- rnorm(10)
pred2 <- rnorm(10, 1, 2)
pred3 <- sample(c("AM", "PM"), 10, replace = TRUE)
resp <- 2.1 + pred1 * .3 + pred2 * .9
df <- data.frame(pred1, pred2, pred3, resp)
df
#>     pred1   pred2 pred3 resp
#> 1  -0.117 -0.0196    AM 2.05
#> 2  -1.133  0.1529    AM 1.90
#> 3   0.632  3.8004    AM 5.71
#> 4   0.188  4.5922    AM 6.29
#> 5   0.892  1.8556    AM 4.04
#> 6  -1.224  2.8140    PM 4.27
#> 7   0.174  0.4908    AM 2.59
#> 8  -0.689 -0.1335    PM 1.77
#> 9   1.204 -0.0482    AM 2.42
#> 10  0.697  2.2268    PM 4.31
df <- data.frame(p1 = pred1, p2 = pred2, p3 = pred3, r = resp)
head(df, 3)
#>       p1      p2 p3    r
#> 1 -0.117 -0.0196 AM 2.05
#> 2 -1.133  0.1529 AM 1.90
#> 3  0.632  3.8004 AM 5.71
tib <- as_tibble(list(p1 = pred1, p2 = pred2, p3 = pred3, r = resp))
tib
#> # A tibble: 10 x 4
#>       p1      p2 p3        r
#>    <dbl>   <dbl> <chr> <dbl>
#> 1 -0.117 -0.0196 AM     2.05
#> 2 -1.13   0.153  AM     1.90
#> 3  0.632  3.80   AM     5.71
#> 4  0.188  4.59   AM     6.29
#> 5  0.892  1.86   AM     4.04
#> 6 -1.22   2.81   PM     4.27
#> # ... with 4 more rows
r1 <- data.frame(a = 1, b = 2, c = "a")
r2 <- data.frame(a = 3, b = 4, c = "b")
r3 <- data.frame(a = 5, b = 6, c = "c")
obs <- list(r1, r2, r3)
df <- do.call(rbind, obs)
df
#>   a b c
#> 1 1 2 a
#> 2 3 4 b
#> 3 5 6 c
rbind(obs[[1]], obs[[2]])
#>   a b c
#> 1 1 2 a
#> 2 3 4 b
do.call(rbind, obs)
#>   a b c
#> 1 1 2 a
#> 2 3 4 b
#> 3 5 6 c
l1 <- list(a = 1, b = 2, c = "a")
l2 <- list(a = 3, b = 4, c = "b")
l3 <- list(a = 5, b = 6, c = "c")
obs <- list(l1, l2, l3)
df <- do.call(rbind, Map(as.data.frame, obs))
df
#>   a b c
#> 1 1 2 a
#> 2 3 4 b
#> 3 5 6 c
r1 <- 1:3
r2 <- 6:8
r3 <- rnorm(3)
obs <- list(r1, r2, r3)
df <- do.call(rbind, obs)
df
#>        [,1]   [,2] [,3]
#> [1,]  1.000  2.000  3.0
#> [2,]  6.000  7.000  8.0
#> [3,] -0.945 -0.547  1.6
data.frame(a = 1, b = 2, c = "a", stringsAsFactors = FALSE)
#>   a b c
#> 1 1 2 a
## same set up as in the previous examples
l1 <- list( a=1, b=2, c='a' )
l2 <- list( a=3, b=4, c='b' )
l3 <- list( a=5, b=6, c='c' )
obs <- list(l1, l2, l3)
df <- do.call(rbind,Map(as.data.frame,obs))
# yes, you could use stringsAsFactors=FALSE above, but we're assuming the data.frame
# came to you with factors already

i <- sapply(df, is.factor)             ## determine which columns are factors
df[i] <- lapply(df[i], as.character)   ## turn only the factors to characters
df
newRow <- data.frame(city = "West Dundee", county = "Kane", state = "IL", pop = 5428)
library(tidyverse)
suburbs <- read_csv("./data/suburbs.txt")
#> Parsed with column specification:
#> cols(
#>   city = col_character(),
#>   county = col_character(),
#>   state = col_character(),
#>   pop = col_double()
#> )

suburbs2 <- rbind(suburbs, newRow)
suburbs2
#> # A tibble: 18 x 4
#>   city    county   state     pop
#>   <chr>   <chr>    <chr>   <dbl>
#> 1 Chicago Cook     IL    2853114
#> 2 Kenosha Kenosha  WI      90352
#> 3 Aurora  Kane     IL     171782
#> 4 Elgin   Kane     IL      94487
#> 5 Gary    Lake(IN) IN     102746
#> 6 Joliet  Kendall  IL     106221
#> # ... with 12 more rows
suburbs3 <- rbind(suburbs, data.frame(city = "West Dundee", county = "Kane", state = "IL", pop = 5428))
suburbs4 <- rbind(
  suburbs,
  data.frame(city = "West Dundee", county = "Kane", state = "IL", pop = 5428),
  data.frame(city = "East Dundee", county = "Kane", state = "IL", pop = 2955)
)
str(suburbs)
#> Classes 'tbl_df', 'tbl' and 'data.frame':    17 obs. of  4 variables:
#>  $ city  : chr  "Chicago" "Kenosha" "Aurora" "Elgin" ...
#>  $ county: chr  "Cook" "Kenosha" "Kane" "Kane" ...
#>  $ state : chr  "IL" "WI" "IL" "IL" ...
#>  $ pop   : num  2853114 90352 171782 94487 102746 ...
#>  - attr(*, "spec")=
#>   .. cols(
#>   ..   city = col_character(),
#>   ..   county = col_character(),
#>   ..   state = col_character(),
#>   ..   pop = col_double()
#>   .. )
str(newRow)
#> 'data.frame':    1 obs. of  4 variables:
#>  $ city  : Factor w/ 1 level "West Dundee": 1
#>  $ county: Factor w/ 1 level "Kane": 1
#>  $ state : Factor w/ 1 level "IL": 1
#>  $ pop   : num 5428
rbind(some_tibble, some_data.frame)
rbind(some_data.frame, some_tibble)
n <- 5
df <- data.frame(colname1 = numeric(n), colname2 = character(n))
n <- 1000000
df <- data.frame(
  dosage = numeric(n),
  lab = character(n),
  response = numeric(n),
  stringsAsFactors = FALSE
)
str(df)
#> 'data.frame':    1000000 obs. of  3 variables:
#>  $ dosage  : num  0 0 0 0 0 0 0 0 0 0 ...
#>  $ lab     : chr  "" "" "" "" ...
#>  $ response: num  0 0 0 0 0 0 0 0 0 0 ...
n <- 1000000
df <- data.frame(
  dosage = numeric(n),
  lab = factor(n, levels = c("NJ", "IL", "CA")),
  response = numeric(n)
)
str(df)
#> 'data.frame':    1000000 obs. of  3 variables:
#>  $ dosage  : num  0 0 0 0 0 0 0 0 0 0 ...
#>  $ lab     : Factor w/ 3 levels "NJ","IL","CA": NA NA NA NA NA NA NA NA NA NA ...
#>  $ response: num  0 0 0 0 0 0 0 0 0 0 ...
suburbs <- read_csv("./data/suburbs.txt")
#> Parsed with column specification:
#> cols(
#>   city = col_character(),
#>   county = col_character(),
#>   state = col_character(),
#>   pop = col_double()
#> )
suburbs
#> # A tibble: 17 x 4
#>   city    county   state     pop
#>   <chr>   <chr>    <chr>   <dbl>
#> 1 Chicago Cook     IL    2853114
#> 2 Kenosha Kenosha  WI      90352
#> 3 Aurora  Kane     IL     171782
#> 4 Elgin   Kane     IL      94487
#> 5 Gary    Lake(IN) IN     102746
#> 6 Joliet  Kendall  IL     106221
#> # ... with 11 more rows
suburbs[[1]]
#>  [1] "Chicago"           "Kenosha"           "Aurora"
#>  [4] "Elgin"             "Gary"              "Joliet"
#>  [7] "Naperville"        "Arlington Heights" "Bolingbrook"
#> [10] "Cicero"            "Evanston"          "Hammond"
#> [13] "Palatine"          "Schaumburg"        "Skokie"
#> [16] "Waukegan"          "West Dundee"
suburbs[1]
#> # A tibble: 17 x 1
#>   city
#>   <chr>
#> 1 Chicago
#> 2 Kenosha
#> 3 Aurora
#> 4 Elgin
#> 5 Gary
#> 6 Joliet
#> # ... with 11 more rows
suburbs %>%
  dplyr::select(1)
#> # A tibble: 17 x 1
#>   city
#>   <chr>
#> 1 Chicago
#> 2 Kenosha
#> 3 Aurora
#> 4 Elgin
#> 5 Gary
#> 6 Joliet
#> # ... with 11 more rows
suburbs %>%
  dplyr::select(1, 4)
#> # A tibble: 17 x 2
#>   city        pop
#>   <chr>     <dbl>
#> 1 Chicago 2853114
#> 2 Kenosha   90352
#> 3 Aurora   171782
#> 4 Elgin     94487
#> 5 Gary     102746
#> 6 Joliet   106221
#> # ... with 11 more rows
suburbs[c(1, 3)]
#> # A tibble: 17 x 2
#>   city    state
#>   <chr>   <chr>
#> 1 Chicago IL
#> 2 Kenosha WI
#> 3 Aurora  IL
#> 4 Elgin   IL
#> 5 Gary    IN
#> 6 Joliet  IL
#> # ... with 11 more rows
suburbs[, 1]
#> # A tibble: 17 x 1
#>   city
#>   <chr>
#> 1 Chicago
#> 2 Kenosha
#> 3 Aurora
#> 4 Elgin
#> 5 Gary
#> 6 Joliet
#> # ... with 11 more rows
suburbs[, c(1, 4)]
#> # A tibble: 17 x 2
#>   city        pop
#>   <chr>     <dbl>
#> 1 Chicago 2853114
#> 2 Kenosha   90352
#> 3 Aurora   171782
#> 4 Elgin     94487
#> 5 Gary     102746
#> 6 Joliet   106221
#> # ... with 11 more rows
df[, vec]
df[, vec, drop = FALSE]
subset(df, select = colname)
subset(df, select = c(colname1, ..., colnameN))
subset(suburbs, subset = (pop > 100000))
#> # A tibble: 5 x 4
#>   city       county   state     pop
#>   <chr>      <chr>    <chr>   <dbl>
#> 1 Chicago    Cook     IL    2853114
#> 2 Aurora     Kane     IL     171782
#> 3 Gary       Lake(IN) IN     102746
#> 4 Joliet     Kendall  IL     106221
#> 5 Naperville DuPage   IL     147779
subset(suburbs, select = c(city, state, pop), subset = (pop > 100000))
#> # A tibble: 5 x 3
#>   city       state     pop
#>   <chr>      <chr>   <dbl>
#> 1 Chicago    IL    2853114
#> 2 Aurora     IL     171782
#> 3 Gary       IN     102746
#> 4 Joliet     IL     106221
#> 5 Naperville IL     147779
suburbs %>%
  dplyr::select(city, state, pop) %>%
  filter(pop > 100000)
#> # A tibble: 5 x 3
#>   city       state     pop
#>   <chr>      <chr>   <dbl>
#> 1 Chicago    IL    2853114
#> 2 Aurora     IL     171782
#> 3 Gary       IN     102746
#> 4 Joliet     IL     106221
#> 5 Naperville IL     147779
library(MASS)
#>
#> Attaching package: 'MASS'
#> The following object is masked from 'package:dplyr':
#>
#>     select
my_subset <- subset(Cars93, select = Model, subset = (MPG.city > 30))
head(my_subset)
#>      Model
#> 31 Festiva
#> 39   Metro
#> 42   Civic
#> 73  LeMans
#> 80   Justy
#> 83   Swift
Cars93 %>%
  filter(MPG.city > 30) %>%
  select(Model) %>%
  head()
#> Error in select(., Model): unused argument (Model)
Cars93 %>%
  filter(MPG.city > 30) %>%
  dplyr::select(Model) %>%
  head()
#>     Model
#> 1 Festiva
#> 2   Metro
#> 3   Civic
#> 4  LeMans
#> 5   Justy
#> 6   Swift
my_cars <- subset(Cars93,
  select = c(Model, Min.Price, Max.Price),
  subset = (Cylinders == 4 & Origin == "USA")
)
head(my_cars)
#>       Model Min.Price Max.Price
#> 6   Century      14.2      17.3
#> 12 Cavalier       8.5      18.3
#> 13  Corsica      11.4      11.4
#> 15   Lumina      13.4      18.4
#> 21  LeBaron      14.5      17.1
#> 23     Colt       7.9      10.6
Cars93 %>%
  filter(Cylinders == 4 & Origin == "USA") %>%
  dplyr::select(Model, Min.Price, Max.Price) %>%
  head()
#>      Model Min.Price Max.Price
#> 1  Century      14.2      17.3
#> 2 Cavalier       8.5      18.3
#> 3  Corsica      11.4      11.4
#> 4   Lumina      13.4      18.4
#> 5  LeBaron      14.5      17.1
#> 6     Colt       7.9      10.6
my_cars <- subset(Cars93,
  select = c(Manufacturer, Model),
  subset = c(MPG.highway > median(MPG.highway))
)
head(my_cars)
#>    Manufacturer    Model
#> 1         Acura  Integra
#> 5           BMW     535i
#> 6         Buick  Century
#> 12    Chevrolet Cavalier
#> 13    Chevrolet  Corsica
#> 15    Chevrolet   Lumina
Cars93 %>%
  filter(MPG.highway > median(MPG.highway)) %>%
  dplyr::select(Manufacturer, Model) %>%
  head()
#>   Manufacturer    Model
#> 1        Acura  Integra
#> 2          BMW     535i
#> 3        Buick  Century
#> 4    Chevrolet Cavalier
#> 5    Chevrolet  Corsica
#> 6    Chevrolet   Lumina
detach("package:MASS", unload = TRUE)
df <- data.frame(V1 = 1:3, V2 = 4:6, V3 = 7:9)
df
#>   V1 V2 V3
#> 1  1  4  7
#> 2  2  5  8
#> 3  3  6  9
colnames(df) <- c("tom", "dick", "harry") # a vector of character strings
df
#>   tom dick harry
#> 1   1    4     7
#> 2   2    5     8
#> 3   3    6     9
df <- data.frame(V1 = 1:3, V2 = 4:6, V3 = 7:9)
df %>%
  rename(tom = V1, dick = V2, harry = V3)
#>   tom dick harry
#> 1   1    4     7
#> 2   2    5     8
#> 3   3    6     9
mat <- matrix(rnorm(9), nrow = 3, ncol = 3)
mat
#>       [,1]    [,2]   [,3]
#> [1,] 0.701  0.0976  0.821
#> [2,] 0.388 -1.2755 -1.086
#> [3,] 1.968  1.2544  0.111
as.data.frame(mat)
#>      V1      V2     V3
#> 1 0.701  0.0976  0.821
#> 2 0.388 -1.2755 -1.086
#> 3 1.968  1.2544  0.111
lst <- list(1:3, c("a", "b", "c"), round(rnorm(3), 3))
lst
#> [[1]]
#> [1] 1 2 3
#>
#> [[2]]
#> [1] "a" "b" "c"
#>
#> [[3]]
#> [1] 0.181 0.773 0.983
as.data.frame(lst)
#>   X1.3 c..a....b....c.. c.0.181..0.773..0.983.
#> 1    1                a                  0.181
#> 2    2                b                  0.773
#> 3    3                c                  0.983
df <- as.data.frame(lst)
colnames(df) <- c("patient", "treatment", "value")
df
#>   patient treatment value
#> 1       1         a 0.181
#> 2       2         b 0.773
#> 3       3         c 0.983
df <- as.data.frame(lst)
df %>%
  rename(
    "patient" = !!names(.[1]),
    "treatment" = !!names(.[2]),
    "value" = !!names(.[3])
  )
#>   patient treatment value
#> 1       1         a 0.181
#> 2       2         b 0.773
#> 3       3         c 0.983
names(lst) <- c("patient", "treatment", "value")
as.data.frame(lst)
#>   patient treatment value
#> 1       1         a 0.181
#> 2       2         b 0.773
#> 3       3         c 0.983
df <- data.frame(my_data = c(NA, 1, NA, 2, NA, 3))
df
#>   my_data
#> 1      NA
#> 2       1
#> 3      NA
#> 4       2
#> 5      NA
#> 6       3
clean_df <- na.omit(df)
clean_df
#>   my_data
#> 2       1
#> 4       2
#> 6       3
df <- data.frame(
  x = c(NA, rnorm(4)),
  y = c(rnorm(2), NA, rnorm(2))
)
df
#>        x      y
#> 1     NA -0.836
#> 2  0.670 -0.922
#> 3 -1.421     NA
#> 4 -0.236 -1.123
#> 5 -0.975  0.372
cumsum(df)
#>    x      y
#> 1 NA -0.836
#> 2 NA -1.759
#> 3 NA     NA
#> 4 NA     NA
#> 5 NA     NA
cumsum(na.omit(df))
#>        x      y
#> 2  0.670 -0.922
#> 4  0.434 -2.046
#> 5 -0.541 -1.674
df <- data.frame(good = rnorm(3), meh = rnorm(3), bad = rnorm(3))
df
#>     good     meh    bad
#> 1  1.911 -0.7045 -1.575
#> 2  0.912  0.0608 -2.238
#> 3 -0.819  0.4424 -0.807
subset(df, select = -bad) # All columns except bad
#>     good     meh
#> 1  1.911 -0.7045
#> 2  0.912  0.0608
#> 3 -0.819  0.4424
df %>%
  dplyr::select(-bad)
#>     good     meh
#> 1  1.911 -0.7045
#> 2  0.912  0.0608
#> 3 -0.819  0.4424
id <- 1:10
pre <- rnorm(10)
dosage <- rnorm(10) + .3 * pre
post <- dosage * .5 * pre
patient_data <- data.frame(id = id, pre = pre, dosage = dosage, post = post)

cor(patient_data)
#>             id     pre  dosage    post
#> id      1.0000 -0.6934 -0.5075  0.0672
#> pre    -0.6934  1.0000  0.5830 -0.0919
#> dosage -0.5075  0.5830  1.0000  0.0878
#> post    0.0672 -0.0919  0.0878  1.0000
cor(subset(patient_data, select = -id))
#>            pre dosage    post
#> pre     1.0000 0.5830 -0.0919
#> dosage  0.5830 1.0000  0.0878
#> post   -0.0919 0.0878  1.0000
patient_data %>%
  dplyr::select(-id) %>%
  cor()
#>            pre dosage    post
#> pre     1.0000 0.5830 -0.0919
#> dosage  0.5830 1.0000  0.0878
#> post   -0.0919 0.0878  1.0000
## JDL Note... now that I've written all this I think the right thing to do is only show dplyr examples... one way to do things is better... fix in edit
cor(subset(patient_data, select = c(-id, -dosage)))
patient_data %>%
  dplyr::select(-id, -dosage) %>%
  cor()
#>          pre    post
#> pre   1.0000 -0.0919
#> post -0.0919  1.0000
df1 <- data_frame(a = rnorm(5))
df2 <- data_frame(b = rnorm(5))

all <- cbind(df1, df2)
all
#>         a       b
#> 1 -1.6357  1.3669
#> 2 -0.3662 -0.5432
#> 3  0.4445 -0.0158
#> 4  0.4945 -0.6960
#> 5  0.0934 -0.7334
df1 <- data_frame(x = rep("a", 2), y = rnorm(2))
df1
#> # A tibble: 2 x 2
#>   x         y
#>   <chr> <dbl>
#> 1 a     1.90
#> 2 a     0.440

df2 <- data_frame(x = rep("b", 2), y = rnorm(2))
df2
#> # A tibble: 2 x 2
#>   x         y
#>   <chr> <dbl>
#> 1 b     2.35
#> 2 b     0.188

rbind(df1, df2)
#> # A tibble: 4 x 2
#>   x         y
#>   <chr> <dbl>
#> 1 a     1.90
#> 2 a     0.440
#> 3 b     2.35
#> 4 b     0.188
df1 <- data_frame(x = rep("a", 2), y = rnorm(2))
df1
#> # A tibble: 2 x 2
#>   x          y
#>   <chr>  <dbl>
#> 1 a     -0.366
#> 2 a     -0.478

df2 <- data_frame(y = 1:2, x = c("b", "b"))
df2
#> # A tibble: 2 x 2
#>       y x
#>   <int> <chr>
#> 1     1 b
#> 2     2 b

rbind(df1, df2)
#> # A tibble: 4 x 2
#>   x          y
#>   <chr>  <dbl>
#> 1 a     -0.366
#> 2 a     -0.478
#> 3 b      1
#> 4 b      2
df1 <- data.frame(index = letters[1:5], val1 = rnorm(5))
df2 <- data.frame(index = letters[1:5], val2 = rnorm(5))

m <- merge(df1, df2, by = "index")
m
#>   index      val1   val2
#> 1     a -0.000837  1.178
#> 2     b -0.214967 -1.599
#> 3     c -1.399293  0.487
#> 4     d  0.010251 -1.688
#> 5     e -0.031463 -0.149
df1 %>%
  inner_join(df2)
#> Joining, by = "index"
#>   index      val1   val2
#> 1     a -0.000837  1.178
#> 2     b -0.214967 -1.599
#> 3     c -1.399293  0.487
#> 4     d  0.010251 -1.688
#> 5     e -0.031463 -0.149
born <- data.frame(
  name = c("Moe", "Larry", "Curly", "Harry"),
  year.born = c(1887, 1902, 1903, 1964),
  place.born = c("Bensonhurst", "Philadelphia", "Brooklyn", "Moscow")
)
died <- data.frame(
  name = c("Curly", "Moe", "Larry"),
  year.died = c(1952, 1975, 1975)
)
merge(born, died, by = "name")
#>    name year.born   place.born year.died
#> 1 Curly      1903     Brooklyn      1952
#> 2 Larry      1902 Philadelphia      1975
#> 3   Moe      1887  Bensonhurst      1975
born %>%
  inner_join(died)
#> Joining, by = "name"
#> Warning: Column `name` joining factors with different levels, coercing to
#> character vector
#>    name year.born   place.born year.died
#> 1   Moe      1887  Bensonhurst      1975
#> 2 Larry      1902 Philadelphia      1975
#> 3 Curly      1903     Brooklyn      1952
with(dataframe, expr)
z <- (suburbs$pop - mean(suburbs$pop)) / sd(suburbs$pop)
z
#>  [1]  3.875 -0.237 -0.116 -0.231 -0.219 -0.214 -0.152 -0.259 -0.266 -0.264
#> [11] -0.261 -0.248 -0.272 -0.260 -0.277 -0.236 -0.364
z <- with(suburbs, (pop - mean(pop)) / sd(pop))
z
#>  [1]  3.875 -0.237 -0.116 -0.231 -0.219 -0.214 -0.152 -0.259 -0.266 -0.264
#> [11] -0.261 -0.248 -0.272 -0.260 -0.277 -0.236 -0.364
suburbs %>%
  mutate(z = (pop - mean(pop)) / sd(pop))
#> # A tibble: 17 x 5
#>   city    county   state     pop      z
#>   <chr>   <chr>    <chr>   <dbl>  <dbl>
#> 1 Chicago Cook     IL    2853114  3.88
#> 2 Kenosha Kenosha  WI      90352 -0.237
#> 3 Aurora  Kane     IL     171782 -0.116
#> 4 Elgin   Kane     IL      94487 -0.231
#> 5 Gary    Lake(IN) IN     102746 -0.219
#> 6 Joliet  Kendall  IL     106221 -0.214
#> # ... with 11 more rows
as.numeric(" 3.14 ")
#> [1] 3.14
as.integer(3.14)
#> [1] 3
as.numeric("foo")
#> Warning: NAs introduced by coercion
#> [1] NA
as.character(101)
#> [1] "101"
as.numeric(c("1", "2.718", "7.389", "20.086"))
#> [1]  1.00  2.72  7.39 20.09
as.numeric(c("1", "2.718", "7.389", "20.086", "etc."))
#> Warning: NAs introduced by coercion
#> [1]  1.00  2.72  7.39 20.09    NA
as.character(101:105)
#> [1] "101" "102" "103" "104" "105"
as.numeric(FALSE)
#> [1] 0
as.numeric(TRUE)
#> [1] 1
logvec <- c(TRUE, FALSE, TRUE, TRUE, TRUE, FALSE)
sum(logvec) ## num true
#> [1] 4
length(logvec) - sum(logvec) ## num not true
#> [1] 2
library(tidyverse)

lst %>%
  map(fun)
library(tidyverse)

lst <- list(
  a = c(1,2,3),
  b = c(4,5,6)
)
lst %>%
  map(mean)
#> $a
#> [1] 2
#>
#> $b
#> [1] 5
fun <- function(x) {
  if (x > 1) {
    1
  } else {
    "Less Than 1"
  }
}

fun(5)
#> [1] 1
fun(0.5)
#> [1] "Less Than 1"
lst <- list(.5, 1.5, .9, 2)

map(lst, fun)
#> [[1]]
#> [1] "Less Than 1"
#>
#> [[2]]
#> [1] 1
#>
#> [[3]]
#> [1] "Less Than 1"
#>
#> [[4]]
#> [1] 1
map_chr(lst, fun)
#> [1] "Less Than 1" "1.000000"    "Less Than 1" "1.000000"

## or using pipes
lst %>%
  map_chr(fun)
#> [1] "Less Than 1" "1.000000"    "Less Than 1" "1.000000"
map_dbl(lst, fun)
#> Error: Can't coerce element 1 from a character to a double
fun <- function(a, b, c) {
  # calculate the sum of a sequence from a to b by c
  sum(seq(a, b, c))
}

df <- data.frame(mn = c(1, 2, 3),
                 mx = c(8, 13, 18),
                 rng = c(1, 2, 3))

df %>%
  mutate(output =
           pmap_dbl(list(a = mn, b = mx, c = rng), fun))
#>   mn mx rng output
#> 1  1  8   1     36
#> 2  2 13   2     42
#> 3  3 18   3     63
pmap(list(a = df$mn, b = df$mx, c = df$rng), fun)
#> [[1]]
#> [1] 36
#>
#> [[2]]
#> [1] 42
#>
#> [[3]]
#> [1] 63
results <- apply(mat, 1, fun)    # mat is a matrix, fun is a function
long <- matrix(1:15, 3, 5)
long
#>      [,1] [,2] [,3] [,4] [,5]
#> [1,]    1    4    7   10   13
#> [2,]    2    5    8   11   14
#> [3,]    3    6    9   12   15
apply(long, 1, mean)
#> [1] 7 8 9
rownames(long) <- c("Moe", "Larry", "Curly")
apply(long, 1, mean)
#>   Moe Larry Curly
#>     7     8     9
apply(long, 1, range)
#>      Moe Larry Curly
#> [1,]   1     2     3
#> [2,]  13    14    15
apply(mat, 2, fun)
mat <- matrix(c(1, 3, 2, 5, 4, 6), 2, 3)
colnames(mat) <- c("t1", "t2", "t3")
mat
#>      t1 t2 t3
#> [1,]  1  2  4
#> [2,]  3  5  6

apply(mat, 2, mean)  # Compute the mean of every column
#>  t1  t2  t3
#> 2.0 3.5 5.0
df2 <- map_df(df, fun) # Returns a data.frame
load("./data/batches.rdata")
head(batches)
#>   batch clinic dosage shrinkage
#> 1     3     KY     IL    -0.307
#> 2     3     IL     IL    -1.781
#> 3     1     KY     IL    -0.172
#> 4     3     KY     IL     1.215
#> 5     2     IL     IL     1.895
#> 6     2     NJ     IL    -0.430
map_df(batches, class)
#> # A tibble: 1 x 4
#>   batch  clinic dosage shrinkage
#>   <chr>  <chr>  <chr>  <chr>
#> 1 factor factor factor numeric
lst <- list(v1, v2, v3)
pmap(lst, fun)
map2(v1, v2, fun)
map2_dbl(v1, v2, fun)
gcd <- function(a, b) {
  if (b == 0) {
    return(a)
  } else {
    return(gcd(b, a %% b))
  }
}
gcd(c(1, 2, 3), c(9, 6, 3))
#> Warning in if (b == 0) {: the condition has length > 1 and only the first
#> element will be used

#> Warning in if (b == 0) {: the condition has length > 1 and only the first
#> element will be used

#> Warning in if (b == 0) {: the condition has length > 1 and only the first
#> element will be used
#> [1] 1 2 0
a <- c(1, 2, 3)
b <- c(9, 6, 3)
my_gcds <- map2(a, b, gcd)
my_gcds
#> [[1]]
#> [1] 1
#>
#> [[2]]
#> [1] 2
#>
#> [[3]]
#> [1] 3
unlist(my_gcds)
#> [1] 1 2 3
map2_chr(a, b, gcd)
#> [1] "1.000000" "2.000000" "3.000000"
map2_dbl(a, b, gcd)
#> [1] 1 2 3
lst <- list(a,b)
pmap(lst, gcd)
#> [[1]]
#> [1] 1
#>
#> [[2]]
#> [1] 2
#>
#> [[3]]
#> [1] 3
lst <- list(a,b)
pmap_dbl(lst, gcd)
#> [1] 1 2 3
df %>%
  group_by(v1, v2) %>%
  summarize(
    result_var = fun(value_var)
  )
df <- tibble(
  my_group = c("A", "B","A", "B","A", "B"),
  values = 1:6
)

df %>%
  group_by(my_group) %>%
  summarize(
    avg_values = mean(values),
    tot_values = sum(values),
    count_values = n()
  )
#> # A tibble: 2 x 4
#>   my_group avg_values tot_values count_values
#>   <chr>         <dbl>      <int>        <int>
#> 1 A                 3          9            3
#> 2 B                 4         12            3
nchar("Moe")
#> [1] 3
nchar("Curly")
#> [1] 5
s <- c("Moe", "Larry", "Curly")
nchar(s)
#> [1] 3 5 5
length("Moe")
#> [1] 1
length(c("Moe", "Larry", "Curly"))
#> [1] 3
paste("Everybody", "loves", "stats.")
#> [1] "Everybody loves stats."
paste("Everybody", "loves", "stats.", sep = "-")
#> [1] "Everybody-loves-stats."
paste("Everybody", "loves", "stats.", sep = "")
#> [1] "Everybodylovesstats."
paste0("Everybody", "loves", "stats.")
#> [1] "Everybodylovesstats."
paste("The square root of twice pi is approximately", sqrt(2 * pi))
#> [1] "The square root of twice pi is approximately 2.506628274631"
stooges <- c("Moe", "Larry", "Curly")
paste(stooges, "loves", "stats.")
#> [1] "Moe loves stats."   "Larry loves stats." "Curly loves stats."
paste(stooges, "loves", "stats", collapse = ", and ")
#> [1] "Moe loves stats, and Larry loves stats, and Curly loves stats"
substr("Statistics", 1, 4) # Extract first 4 characters
#> [1] "Stat"
substr("Statistics", 7, 10) # Extract last 4 characters
#> [1] "tics"
ss <- c("Moe", "Larry", "Curly")
substr(ss, 1, 3) # Extract first 3 characters of each string
#> [1] "Moe" "Lar" "Cur"
cities <- c("New York, NY", "Los Angeles, CA", "Peoria, IL")
substr(cities, nchar(cities) - 1, nchar(cities))
#> [1] "NY" "CA" "IL"
strsplit(string, delimiter)
path <- "/home/mike/data/trials.csv"
strsplit(path, "/")
#> [[1]]
#> [1] ""           "home"       "mike"       "data"       "trials.csv"
strsplit(path, "/")[[1]]
#> [1] ""           "home"       "mike"       "data"       "trials.csv"
paths <- c(
  "/home/mike/data/trials.csv",
  "/home/mike/data/errors.csv",
  "/home/mike/corr/reject.doc"
)
strsplit(paths, "/")
#> [[1]]
#> [1] ""           "home"       "mike"       "data"       "trials.csv"
#>
#> [[2]]
#> [1] ""           "home"       "mike"       "data"       "errors.csv"
#>
#> [[3]]
#> [1] ""           "home"       "mike"       "corr"       "reject.doc"
sub(old, new, string)
gsub(old, new, string)
str <- "Curly is the smart one. Curly is funny, too."
sub("Curly", "Moe", str)
#> [1] "Moe is the smart one. Curly is funny, too."
gsub("Curly", "Moe", str)
#> [1] "Moe is the smart one. Moe is funny, too."
sub(" and SAS", "", "For really tough problems, you need R and SAS.")
#> [1] "For really tough problems, you need R."
m <- outer(strings1, strings2, paste, sep = "")
locations <- c("NY", "LA", "CHI", "HOU")
treatments <- c("T1", "T2", "T3")
outer(locations, treatments, paste, sep = "-")
#>      [,1]     [,2]     [,3]
#> [1,] "NY-T1"  "NY-T2"  "NY-T3"
#> [2,] "LA-T1"  "LA-T2"  "LA-T3"
#> [3,] "CHI-T1" "CHI-T2" "CHI-T3"
#> [4,] "HOU-T1" "HOU-T2" "HOU-T3"
outer(treatments, treatments, paste, sep = "-")
#>      [,1]    [,2]    [,3]
#> [1,] "T1-T1" "T1-T2" "T1-T3"
#> [2,] "T2-T1" "T2-T2" "T2-T3"
#> [3,] "T3-T1" "T3-T2" "T3-T3"
expand.grid(treatments, treatments)
#>   Var1 Var2
#> 1   T1   T1
#> 2   T2   T1
#> 3   T3   T1
#> 4   T1   T2
#> 5   T2   T2
#> 6   T3   T2
#> 7   T1   T3
#> 8   T2   T3
#> 9   T3   T3
m <- outer(treatments, treatments, paste, sep = "-")
m[!lower.tri(m)]
#> [1] "T1-T1" "T1-T2" "T2-T2" "T1-T3" "T2-T3" "T3-T3"
Sys.Date()
#> [1] "2019-01-07"
class(Sys.Date())
#> [1] "Date"
as.Date("2018-12-31")
#> [1] "2018-12-31"
as.Date("12/31/2018")
#> Error in charToDate(x): character string is not in a standard unambiguous format
as.Date("12/31/2018", format = "%m/%d/%Y")
#> [1] "2018-12-31"
format(Sys.Date())
#> [1] "2019-01-07"
as.character(Sys.Date())
#> [1] "2019-01-07"
format(Sys.Date(), format = "%m/%d/%Y")
#> [1] "01/07/2019"
ISOdate(year, month, day)
year <- 2018
month <- 12
day <- 31
as.Date(ISOdate(year, month, day))
#> [1] "2018-12-31"
ISOdate(2020, 2, 29)
#> [1] "2020-02-29 12:00:00 GMT"
as.Date(ISOdate(2020, 2, 29))
#> [1] "2020-02-29"
ISOdate(2013, 2, 29) # Oops! 2013 is not a leap year
#> [1] NA
years <- 2010:2014
months <- rep(1, 5)
days <- 5:9
ISOdate(years, months, days)
#> [1] "2010-01-05 12:00:00 GMT" "2011-01-06 12:00:00 GMT"
#> [3] "2012-01-07 12:00:00 GMT" "2013-01-08 12:00:00 GMT"
#> [5] "2014-01-09 12:00:00 GMT"
as.Date(ISOdate(years, months, days))
#> [1] "2010-01-05" "2011-01-06" "2012-01-07" "2013-01-08" "2014-01-09"
as.Date(ISOdate(years, 1, days))
#> [1] "2010-01-05" "2011-01-06" "2012-01-07" "2013-01-08" "2014-01-09"
ISOdatetime(year, month, day, hour, minute, second)
d <- as.Date("2019-03-15")
as.integer(d)
#> [1] 17970
jd <- julian(d)
jd
#> [1] 17970
#> attr(,"origin")
#> [1] "1970-01-01"
attr(jd, "origin")
#> [1] "1970-01-01"
as.integer(as.Date("1970-01-01"))
#> [1] 0
as.integer(as.Date("1970-01-02"))
#> [1] 1
as.integer(as.Date("1970-01-03"))
#> [1] 2
d <- as.Date("2019-03-15")
p <- as.POSIXlt(d)
p$mday        # Day of the month
#> [1] 15
p$mon         # Month (0 = January)
#> [1] 2
p$year + 1900 # Year
#> [1] 2019
d <- as.Date("2020-04-02")
as.POSIXlt(d)$wday
#> [1] 4
as.POSIXlt(d)$yday
#> [1] 92
as.POSIXlt(d)$year # Oops!
#> [1] 120
as.POSIXlt(d)$year + 1900
#> [1] 2020
s <- as.Date("2019-01-01")
e <- as.Date("2019-02-01")
seq(from = s, to = e, by = 1) # One month of dates
#>  [1] "2019-01-01" "2019-01-02" "2019-01-03" "2019-01-04" "2019-01-05"
#>  [6] "2019-01-06" "2019-01-07" "2019-01-08" "2019-01-09" "2019-01-10"
#> [11] "2019-01-11" "2019-01-12" "2019-01-13" "2019-01-14" "2019-01-15"
#> [16] "2019-01-16" "2019-01-17" "2019-01-18" "2019-01-19" "2019-01-20"
#> [21] "2019-01-21" "2019-01-22" "2019-01-23" "2019-01-24" "2019-01-25"
#> [26] "2019-01-26" "2019-01-27" "2019-01-28" "2019-01-29" "2019-01-30"
#> [31] "2019-01-31" "2019-02-01"
seq(from = s, by = 1, length.out = 7) # Dates, one week apart
#> [1] "2019-01-01" "2019-01-02" "2019-01-03" "2019-01-04" "2019-01-05"
#> [6] "2019-01-06" "2019-01-07"
seq(from = s, by = "month", length.out = 12)   # First of the month for one year
#>  [1] "2019-01-01" "2019-02-01" "2019-03-01" "2019-04-01" "2019-05-01"
#>  [6] "2019-06-01" "2019-07-01" "2019-08-01" "2019-09-01" "2019-10-01"
#> [11] "2019-11-01" "2019-12-01"
seq(from = s, by = "3 months", length.out = 4) # Quarterly dates for one year
#> [1] "2019-01-01" "2019-04-01" "2019-07-01" "2019-10-01"
seq(from = s, by = "year", length.out = 10)    # Year-start dates for one decade
#>  [1] "2019-01-01" "2020-01-01" "2021-01-01" "2022-01-01" "2023-01-01"
#>  [6] "2024-01-01" "2025-01-01" "2026-01-01" "2027-01-01" "2028-01-01"
seq(as.Date("2019-01-29"), by = "month", len = 3)
#> [1] "2019-01-29" "2019-03-01" "2019-03-29"
?Normal
?TDist
n <- 10
k <- 2
choose(n, k)
#> [1] 45
choose(5, 3)   # How many ways can we select 3 items from 5 items?
#> [1] 10
choose(50, 3)  # How many ways can we select 3 items from 50 items?
#> [1] 19600
choose(50, 30) # How many ways can we select 30 items from 50 items?
#> [1] 4.71e+13
items <- 2:5
k <- 2
combn(items, k)
#>      [,1] [,2] [,3] [,4] [,5] [,6]
#> [1,]    2    2    2    3    3    4
#> [2,]    3    4    5    4    5    5
combn(1:5, 3)
#>      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
#> [1,]    1    1    1    1    1    1    2    2    2     3
#> [2,]    2    2    2    3    3    4    3    3    4     4
#> [3,]    3    4    5    4    5    5    4    5    5     5
combn(c("T1", "T2", "T3", "T4", "T5"), 3)
#>      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
#> [1,] "T1" "T1" "T1" "T1" "T1" "T1" "T2" "T2" "T2" "T3"
#> [2,] "T2" "T2" "T2" "T3" "T3" "T4" "T3" "T3" "T4" "T4"
#> [3,] "T3" "T4" "T5" "T4" "T5" "T5" "T4" "T5" "T5" "T5"
runif(1)
#> [1] 0.915
rnorm(1)
#> [1] 1.53
runif(1)
#> [1] 0.83
runif(10)
#>  [1] 0.642 0.519 0.737 0.135 0.657 0.705 0.458 0.719 0.935 0.255
set.seed(42)
runif(1, min = -3, max = 3)      # One uniform variate between -3 and +3
#> [1] 2.49
rnorm(1)                         # One standard Normal variate
#> [1] 1.53
rnorm(1, mean = 100, sd = 15)    # One Normal variate, mean 100 and SD 15
#> [1] 114
rbinom(1, size = 10, prob = 0.5) # One binomial variate
#> [1] 5
rpois(1, lambda = 10)            # One Poisson variate
#> [1] 12
rexp(1, rate = 0.1)              # One exponential variate
#> [1] 3.14
rgamma(1, shape = 2, rate = 0.1) # One gamma variate
#> [1] 22.3
rnorm(3, mean = c(-10, 0, +10), sd = 1)
#> [1] -9.420 -0.658 11.555
means <- rnorm(30, mean = 0, sd = 0.2)
rnorm(30, mean = means, sd = 1)
#>  [1] -0.5549 -2.9232 -1.2203  0.6962  0.1673 -1.0779 -0.3138 -3.3165
#>  [9]  1.5952  0.8184 -0.1251  0.3601 -0.8142  0.1050  2.1264  0.6943
#> [17] -2.7771  0.9026  0.0389  0.2280 -0.5599  0.9572  0.1972  0.2602
#> [25] -0.4423  1.9707  0.4553  0.0467  1.5229  0.3176
set.seed(42) # Or use any other positive integer...
set.seed(165)   # Initialize generator to known state
runif(10)       # Generate ten random numbers
#>  [1] 0.116 0.450 0.996 0.611 0.616 0.426 0.666 0.168 0.788 0.442

set.seed(165)   # Reinitialize to the same known state
runif(10)       # Generate the same ten "random" numbers
#>  [1] 0.116 0.450 0.996 0.611 0.616 0.426 0.666 0.168 0.788 0.442
sample(set, n)
world_series <- read_csv("./data/world_series.csv")
sample(world_series$year, 10)
#>  [1] 2010 1961 1906 1992 1982 1948 1910 1973 1967 1931
sample(world_series$year, 10)
#>  [1] 1941 1973 1921 1958 1979 1946 1932 1919 1971 1974
set.seed(42)
x <- rnorm(1000, 4, 10)
medians <- numeric(1000)   # empty vector of 1000 numbers
for (i in 1:1000) {
  medians[i] <- median(sample(x, replace = TRUE))
}
ci <- quantile(medians, c(0.025, 0.975))
cat("95% confidence interval is (", ci, ")\n")
#> 95% confidence interval is ( 3.16 4.49 )
sample(set, n, replace = TRUE)
sample(c("H", "T"), 10, replace = TRUE)
#>  [1] "H" "T" "H" "T" "T" "T" "H" "T" "T" "H"
sample(c(FALSE, TRUE), 20, replace = TRUE)
#>  [1]  TRUE FALSE  TRUE  TRUE FALSE  TRUE FALSE FALSE  TRUE  TRUE FALSE
#> [12]  TRUE  TRUE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE
sample(c(FALSE, TRUE), 20, replace = TRUE, prob = c(0.2, 0.8))
#>  [1]  TRUE  TRUE FALSE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE  TRUE
#> [12]  TRUE  TRUE  TRUE  TRUE  TRUE FALSE FALSE  TRUE  TRUE
rbinom(10, 1, 0.8)
#>  [1] 1 0 1 1 1 1 1 0 1 1
sample(v, size = length(v), replace = FALSE)
sample(1:10)
#>  [1]  7  3  6  1  5  2  4  8 10  9
dbinom(7, size = 10, prob = 0.5)
#> [1] 0.117
pbinom(7, size = 10, prob = 0.5)
#> [1] 0.945
pbinom(7, size = 10, prob = 0.5, lower.tail = FALSE)
#> [1] 0.0547
pbinom(7, size = 10, prob = 0.5) - pbinom(3, size = 10, prob = 0.5)
#> [1] 0.773
pbinom(c(3, 7), size = 10, prob = 0.5)
#> [1] 0.172 0.945
diff(pbinom(c(3, 7), size = 10, prob = 0.5))
#> [1] 0.773
pnorm(q = .8, mean = 0, sd = 1)
#> [1] 0.788
pnorm(66, mean = 70, sd = 3)
#> [1] 0.0912
pexp(20, rate = 1 / 40)
#> [1] 0.393
pexp(50, rate = 1 / 40, lower.tail = FALSE)
#> [1] 0.287
pexp(50, rate = 1 / 40) - pexp(20, rate = 1 / 40)
#> [1] 0.32
qnorm(0.05, mean = 100, sd = 15)
#> [1] 75.3
qnorm(0.025)
#> [1] -1.96
qnorm(0.975)
#> [1] 1.96
qnorm(c(0.025, 0.975))
#> [1] -1.96  1.96
dens <- data.frame(x = x,
                   y = d_____(x))
ggplot(dens, aes(x, y)) + geom_line()
library(ggplot2)

x <- seq(-3, +3, 0.1)
dens <- data.frame(x = x, y = dnorm(x))

ggplot(dens, aes(x, y)) + geom_line()
x <- seq(from = 0, to = 6, length.out = 100) # Define the density domains
ylim <- c(0, 0.6)

# Make a data.frame with densities of several distributions
df <- rbind(
  data.frame(x = x, dist_name = "Uniform"    , y = dunif(x, min   = 2, max = 4)),
  data.frame(x = x, dist_name = "Normal"     , y = dnorm(x, mean  = 3, sd = 1)),
  data.frame(x = x, dist_name = "Exponential", y = dexp(x, rate  = 1 / 2)),
  data.frame(x = x, dist_name = "Gamma"      , y = dgamma(x, shape = 2, rate = 1)) )

# Make a line plot like before, but use facet_wrap to create the grid
ggplot(data = df, aes(x = x, y = y)) +
  geom_line() +
  facet_wrap(~dist_name)   # facet and wrap by the variable dist_name
x <- seq(from = -3, to = 3, length.out = 100)
df <- data.frame(x = x, y = dnorm(x, mean = 0, sd = 1))

p <- ggplot(df, aes(x, y)) +
  geom_line() +
  labs(
    title = "Standard Normal Distribution",
    y = "Density",
    x = "Quantile"
  )
p
q75 <- quantile(df$x, .75)
q95 <- quantile(df$x, .95)

p +
  geom_ribbon(
    data = subset(df, x > q75 & x < q95),
    aes(ymax = y),
    ymin = 0,
    fill = "blue",
    colour = NA,
    alpha = 0.5
  )
summary(vec)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
#>     0.0     0.5     1.0     1.6     1.9    33.0
summary(mat)
#>      Samp1           Samp2            Samp3
#>  Min.   :  1.0   Min.   :-2.943   Min.   : 0.04
#>  1st Qu.: 25.8   1st Qu.:-0.774   1st Qu.: 0.39
#>  Median : 50.5   Median :-0.052   Median : 0.85
#>  Mean   : 50.5   Mean   :-0.067   Mean   : 1.60
#>  3rd Qu.: 75.2   3rd Qu.: 0.684   3rd Qu.: 2.12
#>  Max.   :100.0   Max.   : 2.150   Max.   :13.18
summary(fac)
#> Maybe    No   Yes
#>    38    32    30
summary(char)
#>    Length     Class      Mode
#>       100 character character
suburbs <- read_csv("./data/suburbs.txt")
summary(suburbs)
#>      city              county             state
#>  Length:17          Length:17          Length:17
#>  Class :character   Class :character   Class :character
#>  Mode  :character   Mode  :character   Mode  :character
#>
#>
#>
#>       pop
#>  Min.   :   5428
#>  1st Qu.:  72616
#>  Median :  83048
#>  Mean   : 249770
#>  3rd Qu.: 102746
#>  Max.   :2853114
summary(vec_list)
#>   Length Class  Mode
#> x 100    -none- numeric
#> y 100    -none- numeric
#> z 100    -none- character
library(purrr)
map(vec_list, summary)
#> $x
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
#>  -2.572  -0.686  -0.084  -0.043   0.660   2.413
#>
#> $y
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
#>  -1.752  -0.589   0.045   0.079   0.769   2.293
#>
#> $z
#>    Length     Class      Mode
#>       100 character character
mean(x > 3)
#> [1] 0.12
table(f1)
#> f1
#>  a  b  c  d  e
#> 14 23 24 21 18
table(f1, f2)
#>    f2
#> f1   f  g  h
#>   a  6  4  4
#>   b  7  9  7
#>   c  4 11  9
#>   d  7  8  6
#>   e  5 10  3
t1 <- sample(letters[9:11], 100, replace = TRUE)
table(t1)
#> t1
#>  i  j  k
#> 20 40 40
set.seed(42)
initial <- factor(sample(c("Yes", "No", "Maybe"), 100, replace = TRUE))
outcome <- factor(sample(c("Pass", "Fail"), 100, replace = TRUE))

table(initial)
#> initial
#> Maybe    No   Yes
#>    39    31    30

table(outcome)
#> outcome
#> Fail Pass
#>   56   44
table(initial, outcome)
#>        outcome
#> initial Fail Pass
#>   Maybe   23   16
#>   No      20   11
#>   Yes     13   17
summary(table(initial, outcome))
#> Number of cases in table: 100
#> Number of factors: 2
#> Test for independence of all factors:
#>  Chisq = 3, df = 2, p-value = 0.2
summary(table(initial, outcome))
#> Number of cases in table: 100
#> Number of factors: 2
#> Test for independence of all factors:
#>  Chisq = 3, df = 2, p-value = 0.2
quantile(vec, 0.95)
#>  95%
#> 1.43
quantile(vec)
#>      0%     25%     50%     75%    100%
#> -2.0247 -0.5915 -0.0693  0.4618  2.7019
vec <- runif(1000)
quantile(vec, .05)
#>     5%
#> 0.0451
quantile(vec, c(.05, .95))
#>     5%    95%
#> 0.0451 0.9363
quantile(vec)
#>       0%      25%      50%      75%     100%
#> 0.000405 0.235529 0.479543 0.737619 0.999379
mean(vec < 1.6)
#> [1] 0.948
scale(x)
#>          [,1]
#>  [1,]  0.8701
#>  [2,] -0.7133
#>  [3,] -1.0503
#>  [4,]  0.5790
#>  [5,] -0.6324
#>  [6,]  0.0991
#>  [7,]  2.1495
#>  [8,]  0.2481
#>  [9,] -0.8155
#> [10,] -0.7341
#> attr(,"scaled:center")
#> [1] 2.42
#> attr(,"scaled:scale")
#> [1] 2.11
(y - mean(x)) / sd(x)
#> [1] -0.633
t.test(x, mu = m)
x <- rnorm(75, mean = 100, sd = 15)
t.test(x, mu = 95)
#>
#>  One Sample t-test
#>
#> data:  x
#> t = 3, df = 70, p-value = 0.005
#> alternative hypothesis: true mean is not equal to 95
#> 95 percent confidence interval:
#>   96.5 103.0
#> sample estimates:
#> mean of x
#>      99.7
t.test(x, mu = 100)
#>
#>  One Sample t-test
#>
#> data:  x
#> t = -0.2, df = 70, p-value = 0.9
#> alternative hypothesis: true mean is not equal to 100
#> 95 percent confidence interval:
#>   96.5 103.0
#> sample estimates:
#> mean of x
#>      99.7
t.test(x)
t.test(x)
#>
#>  One Sample t-test
#>
#> data:  x
#> t = 50, df = 50, p-value <2e-16
#> alternative hypothesis: true mean is not equal to 0
#> 95 percent confidence interval:
#>   94.2 101.5
#> sample estimates:
#> mean of x
#>      97.9
t.test(x, conf.level = 0.99)
#>
#>  One Sample t-test
#>
#> data:  x
#> t = 50, df = 50, p-value <2e-16
#> alternative hypothesis: true mean is not equal to 0
#> 99 percent confidence interval:
#>   92.9 102.8
#> sample estimates:
#> mean of x
#>      97.9
wilcox.test(x, conf.int = TRUE)
wilcox.test(x, conf.int = TRUE)
#>
#>  Wilcoxon signed rank test
#>
#> data:  x
#> V = 200, p-value = 0.1
#> alternative hypothesis: true location is not equal to 0
#> 95 percent confidence interval:
#>  -0.102  0.646
#> sample estimates:
#> (pseudo)median
#>          0.311
median(x)
#> [1] 0.314
prop.test(x, n, p)
prop.test(11, 20, 0.5, alternative = "greater")
#>
#>  1-sample proportions test with continuity correction
#>
#> data:  11 out of 20, null probability 0.5
#> X-squared = 0.05, df = 1, p-value = 0.4
#> alternative hypothesis: true p is greater than 0.5
#> 95 percent confidence interval:
#>  0.35 1.00
#> sample estimates:
#>    p
#> 0.55
prop.test(x, n)
prop.test(6, 9)
#> Warning in prop.test(6, 9): Chi-squared approximation may be incorrect
#>
#>  1-sample proportions test with continuity correction
#>
#> data:  6 out of 9, null probability 0.5
#> X-squared = 0.4, df = 1, p-value = 0.5
#> alternative hypothesis: true p is not equal to 0.5
#> 95 percent confidence interval:
#>  0.309 0.910
#> sample estimates:
#>     p
#> 0.667
prop.test(x, n, p, conf.level = 0.99)   # 99% confidence level
shapiro.test(x)
shapiro.test(x)
#>
#>  Shapiro-Wilk normality test
#>
#> data:  x
#> W = 1, p-value = 0.05
shapiro.test(y)
#>
#>  Shapiro-Wilk normality test
#>
#> data:  y
#> W = 0.7, p-value = 9e-12
library(tseries)
runs.test(as.factor(s))
s <- sample(c(0, 1), 100, replace = T)
runs.test(as.factor(s))
#>
#>  Runs Test
#>
#> data:  as.factor(s)
#> Standard Normal = 0.1, p-value = 0.9
#> alternative hypothesis: two.sided
s <- c(0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0)
runs.test(as.factor(s))
#>
#>  Runs Test
#>
#> data:  as.factor(s)
#> Standard Normal = -2, p-value = 0.02
#> alternative hypothesis: two.sided
t.test(x, y)
t.test(x, y, paired = TRUE)
load("./data/sat.rdata")
t.test(x, y)
#>
#>  Welch Two Sample t-test
#>
#> data:  x and y
#> t = -1, df = 200, p-value = 0.3
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#>  -46.4  16.2
#> sample estimates:
#> mean of x mean of y
#>      1054      1069
t.test(x, y, paired = TRUE)
#>
#>  Paired t-test
#>
#> data:  x and y
#> t = -20, df = 100, p-value <2e-16
#> alternative hypothesis: true difference in means is not equal to 0
#> 95 percent confidence interval:
#>  -16.8 -13.5
#> sample estimates:
#> mean of the differences
#>                   -15.1
wilcox.test(x, y, paired = TRUE)
wilcox.test(x, y)
load(file = "./data/workers.rdata")
wilcox.test(fav, unfav, paired = TRUE)
#>
#>  Wilcoxon signed rank test
#>
#> data:  fav and unfav
#> V = 10, p-value = 1e-04
#> alternative hypothesis: true location shift is not equal to 0
cor.test(x, y)
cor.test(x, y, method = "spearman")
cor(x, y)
#> [1] 0.751
cor.test(x, y)
#>
#>  Pearson's product-moment correlation
#>
#> data:  x and y
#> t = 2, df = 4, p-value = 0.09
#> alternative hypothesis: true correlation is not equal to 0
#> 95 percent confidence interval:
#>  -0.155  0.971
#> sample estimates:
#>   cor
#> 0.751
#>
#>  2-sample test for equality of proportions with continuity
#>  correction
#>
#> data:  ns out of nt
#> X-squared = 5, df = 1, p-value = 0.03
#> alternative hypothesis: two.sided
#> 95 percent confidence interval:
#>  -0.3058 -0.0142
#> sample estimates:
#> prop 1 prop 2
#>   0.48   0.64
ns <- c(48, 64)
nt <- c(100, 100)
prop.test(ns, nt)
successes <- c(14, 10)
trials <- c(38, 40)
prop.test(successes, trials)
#>
#>  2-sample test for equality of proportions with continuity
#>  correction
#>
#> data:  successes out of trials
#> X-squared = 0.8, df = 1, p-value = 0.4
#> alternative hypothesis: two.sided
#> 95 percent confidence interval:
#>  -0.111  0.348
#> sample estimates:
#> prop 1 prop 2
#>  0.368  0.250
pairwise.t.test(x, f)   # x is the data, f is the grouping factor
pairwise.t.test(comb$values, comb$ind)
#>
#>  Pairwise comparisons using t tests with pooled SD
#>
#> data:  comb$values and comb$ind
#>
#>      fresh soph
#> soph 0.001 -
#> jrs  3e-04 0.592
#>
#> P value adjustment method: holm
ks.test(x, y)
#>
#>  Two-sample Kolmogorov-Smirnov test
#>
#> data:  x and y
#> D = 0.2, p-value = 0.04
#> alternative hypothesis: two-sided
ks.test(x, y)
#>
#>  Two-sample Kolmogorov-Smirnov test
#>
#> data:  x and y
#> D = 0.2, p-value = 0.04
#> alternative hypothesis: two-sided
z <- rnorm(100, mean = 4, sd = 6)
ks.test(x, z)
#>
#>  Two-sample Kolmogorov-Smirnov test
#>
#> data:  x and z
#> D = 0.1, p-value = 0.6
#> alternative hypothesis: two-sided
library(tidyverse)
df <- data.frame(x = 1:5, y = 1:5)
ggplot(df, aes(x, y)) +
  geom_point()
ggplot(df, aes(x, y)) +
  geom_point() +
  labs(
    title = "Simple Plot Example",
    subtitle = "with a subtitle",
    x = "x values",
    y = "y values"
  ) +
  theme(panel.background = element_rect(fill = "white", colour = "grey50"))
ggplot(df, aes(x, y)) +
  geom_point()
ggplot(mtcars, aes(hp, mpg)) +
  geom_point()
ggplot(df, aes(x, y)) +
  geom_point() +
  labs(title = "The Title",
       x = "X-axis Label",
       y = "Y-axis Label")
ggplot(mtcars, aes(hp, mpg)) +
  geom_point() +
  labs(title = "Cars: Horsepower vs. Fuel Economy",
       x = "HP",
       y = "Economy (miles per gallon)")
ggplot(df) +
  geom_point(aes(x, y)) +
  theme(panel.background = element_rect(fill = "white", colour = "grey50"))
g1 <- ggplot(mtcars, aes(hp, mpg)) +
  geom_point() +
  labs(title = "Cars: Horsepower vs. Fuel Economy",
       x = "HP",
       y = "Economy (miles per gallon)") +
  theme(panel.background = element_blank())
g1
g2 <- g1 + theme(panel.grid.major =
                   element_line(color = "red", linetype = 3)) +
  # linetype = 3 is dash
  theme(panel.grid.minor =
          element_line(color = "blue", linetype = 4))
  # linetype = 4 is dot dash
g2
g1 +
  theme(panel.grid.major = element_line(colour = "grey"))
ggplot(df, aes(x, y, shape = f)) +
  geom_point()
ggplot(data = iris,
       aes(x = Petal.Length,
           y = Petal.Width)) +
  geom_point()
ggplot(data = iris,
       aes(
         x = Petal.Length,
         y = Petal.Width,
         shape = Species,
         color = Species
       )) +
  geom_point()
g <- ggplot(data = iris,
       aes(x = Petal.Length,
           y = Petal.Width,
           shape="Point Name")) +
  geom_point()  +
  guides(shape=guide_legend(title="Legend Title"))
g
g <- ggplot(data = iris,
            aes(
              x = Petal.Length,
              y = Petal.Width,
              shape = Species,
              color = Species
            )) +
  geom_point() +
  theme(legend.position = "none")
g
g + theme(legend.position = "bottom")
g + theme(legend.position = c(.8, .2))
ggplot(df, aes(x, y)) +
  geom_point() +
  geom_smooth(method = "lm",
              formula = y ~ x,
              se = FALSE)
library(faraway)
data(strongx)

ggplot(strongx, aes(energy, crossx)) +
  geom_point()
g <- ggplot(strongx, aes(energy, crossx)) +
  geom_point()

g + geom_smooth(method = "lm",
                formula = y ~ x,
                se = FALSE)
g + geom_smooth(method = "lm",
                formula = y ~ x)
m <- lm(crossx ~ energy, data = strongx)

ggplot(strongx, aes(energy, crossx)) +
  geom_point() +
  geom_abline(
    intercept = m$coefficients[1],
    slope = m$coefficients[2]
  )
library(GGally)
ggpairs(df)
head(iris)
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1          5.1         3.5          1.4         0.2  setosa
#> 2          4.9         3.0          1.4         0.2  setosa
#> 3          4.7         3.2          1.3         0.2  setosa
#> 4          4.6         3.1          1.5         0.2  setosa
#> 5          5.0         3.6          1.4         0.2  setosa
#> 6          5.4         3.9          1.7         0.4  setosa
library(GGally)
ggpairs(iris)
plot(iris)
ggplot(df, aes(x, y)) +
  geom_point() +
  facet_wrap( ~ f)
data(Cars93, package = "MASS")
ggplot(data = Cars93, aes(MPG.city, Horsepower)) +
  geom_point() +
  facet_wrap( ~ Origin)
ggplot(data = df, aes(x, y)) +
  geom_bar(stat = "identity")
ford_cars <- Cars93 %>%
  filter(Manufacturer == "Ford")

ggplot(ford_cars, aes(Model, Horsepower)) +
  geom_bar(stat = "identity")
ggplot(airquality, aes(month.abb[Month], Temp)) +
  geom_bar(stat = "summary", fun.y = "mean") +
  labs(title = "Mean Temp by Month",
       x = "",
       y = "Temp (deg. F)")
aq_data <- airquality %>%
  arrange(Month) %>%
  mutate(month_abb = fct_inorder(month.abb[Month]))

ggplot(aq_data, aes(month_abb, Temp)) +
  geom_bar(stat = "summary", fun.y = "mean") +
  labs(title = "Mean Temp by Month",
       x = "",
       y = "Temp (deg. F)")
ggplot(df, aes(group, stat)) +
  geom_bar(stat = "identity") +
  geom_errorbar(aes(ymin = lower, ymax = upper), width = .2)
aq_data <- airquality %>%
  arrange(Month) %>%
  mutate(month_abb = fct_inorder(month.abb[Month]))
ggplot(aq_data, aes(month_abb, Temp)) +
  geom_bar(stat = "summary",
           fun.y = "mean",
           fill = "cornflowerblue") +
  stat_summary(fun.data = mean_se, geom = "errorbar") +
  labs(title = "Mean Temp by Month",
       x = "",
       y = "Temp (deg. F)")
ggplot(aq_data, aes(reorder(month_abb,-Temp, mean), Temp)) +
  geom_bar(stat = "summary",
           fun.y = "mean",
           fill = "tomato"           ) +
  stat_summary(fun.data = mean_se, geom = "errorbar") +
  labs(title = "Mean Temp by Month",
       x = "",
       y = "Temp (deg. F)")
ggplot(df, aes(x, y, fill = group))
aq_data <- airquality %>%
  arrange(Month) %>%
  mutate(month_abb = fct_inorder(month.abb[Month]))

ggplot(data = aq_data, aes(month_abb, Temp, fill = month_abb)) +
  geom_bar(stat = "summary", fun.y = "mean") +
  labs(title = "Mean Temp by Month",
       x = "",
       y = "Temp (deg. F)") +
  scale_fill_brewer(palette = "Paired")
ggplot(airquality, aes(month.abb[Month], Temp, fill = ..y..)) +
  geom_bar(stat = "summary", fun.y = "mean") +
  labs(title = "Mean Temp by Month",
       x = "",
       y = "Temp (deg. F)",
       fill = "Temp")
ggplot(df, aes(x, y)) +
  geom_point()
ggplot(df, aes(x , y)) +
  geom_point() +
  geom_line()
ggplot(economics, aes(date , unemploy)) +
  geom_point() +
  geom_line()
ggplot(df, aes(x, y)) +
  geom_line(linetype = 2,
            size = 2,
            col = "red")
x <- 1:10
y1 <- x**1.5
y2 <- x**2
y3 <- x**2.5
df <- data.frame(x, y1, y2, y3)
head(df, 3)
#>   x   y1 y2    y3
#> 1 1 1.00  1  1.00
#> 2 2 2.83  4  5.66
#> 3 3 5.20  9 15.59
df_long <- gather(df, bucket, y, -x)
head(df_long, 3)
#>   x bucket    y
#> 1 1     y1 1.00
#> 2 2     y1 2.83
#> 3 3     y1 5.20
tail(df_long, 3)
#>     x bucket   y
#> 28  8     y3 181
#> 29  9     y3 243
#> 30 10     y3 316
ggplot(df_long, aes(x, y, col = bucket)) +
  geom_line()
ggplot(df, aes(x, y1, size = y2)) +
  geom_line() +
  scale_size(name = "Thickness based on y2")
# example data
n <- 20

x1 <- 1:n
y1 <- rnorm(n, 0, .5)
df1 <- data.frame(x1, y1)

x2 <- (.5 * n):((1.5 * n) - 1)
y2 <- rnorm(n, 1, .5)
df2 <- data.frame(x2, y2)
ggplot() +
  geom_line(data = df1, aes(x = x1, y = y1), color = "darkblue") +
  geom_line(data = df2, aes(x = x2, y = y2), linetype = "dashed")
ggplot() +
  geom_line(data = df1, aes(x = x1, y = y1), color = "darkblue") +
  geom_line(data = df2, aes(x = x2, y = y2), linetype = "dashed") +
  xlim(0, 35) +
  ylim(-2, 2)
# using the data.frame df1 from the prior recipe
ggplot(df1) +
  aes(x = x1, y = y1) +
  geom_point() +
  geom_vline(
    xintercept = 10,
    color = "red",
    linetype = "dashed",
    size = 1.5
  ) +
  geom_hline(yintercept = 0, color = "blue")
samp <- rnorm(1000)
samp_df <- data.frame(samp, x = 1:length(samp))

mean_line <- mean(samp_df$samp)
sd_lines <- mean_line + c(-2, -1, +1, +2) * sd(samp_df$samp)

ggplot(samp_df) +
  aes(x = x, y = samp) +
  geom_point() +
  geom_hline(yintercept = mean_line, color = "darkblue") +
  geom_hline(yintercept = sd_lines, linetype = "dotted")
ggplot(samp_df) +
  aes(y = samp) +
  geom_boxplot()
ggplot(samp_df) +
  aes(y = samp) +
  geom_boxplot() +
  coord_flip()
ggplot(df) +
  aes(x = factor, y = values) +
  geom_boxplot()
data(UScereal, package = "MASS")

ggplot(UScereal) +
  aes(x = as.factor(shelf), y = sugars) +
  geom_boxplot() +
  labs(
    title = "Sugar Content by Shelf",
    x = "Shelf",
    y = "Sugar (grams per portion)"
  )
data(Cars93, package = "MASS")

ggplot(Cars93) +
  geom_histogram(aes(x = MPG.city))
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(Cars93) +
  geom_histogram(aes(x = MPG.city), bins = 13)
ggplot(Cars93) +
  aes(x = MPG.city) +
  geom_histogram(aes(y = ..density..), bins = 21) +
  geom_density()
samp <- rgamma(500, 2, 2)

ggplot() +
  aes(x = samp) +
  geom_histogram(aes(y = ..density..), bins = 10) +
  geom_density()
df <- data.frame(x = rnorm(100))

ggplot(df, aes(sample = x)) +
  stat_qq() +
  stat_qq_line()
ggplot(Cars93, aes(sample = Price)) +
  stat_qq() +
  stat_qq_line()
ggplot(Cars93, aes(sample = log(Price))) +
  stat_qq() +
  stat_qq_line()
df_t <- data.frame(y = rt(100, 5))
est_df <- as.list(MASS::fitdistr(df_t$y, "t")$estimate)[["df"]]
#> Warning in log(s): NaNs produced

#> Warning in log(s): NaNs produced

#> Warning in log(s): NaNs produced
est_df
#> [1] 19.5
ggplot(df_t) +
  aes(sample = y) +
  geom_qq(distribution = qt, dparams = est_df) +
  stat_qq_line(distribution = qt, dparams = est_df)
rate <- 1 / 10
n <- 1000
df_exp <- data.frame(y = rexp(n, rate = rate))
est_exp <- as.list(MASS::fitdistr(df_exp$y, "exponential")$estimate)[["rate"]]
est_exp
#> [1] 0.101
ggplot(df_exp) +
  aes(sample = y) +
  geom_qq(distribution = qexp, dparams = est_exp) +
  stat_qq_line(distribution = qexp, dparams = est_exp)
df <- data.frame(x = rnorm(200), y = rnorm(200))

ggplot(df) +
  aes(x = x, y = y) +
  geom_point(color = "blue")
df <- data.frame(
  x = 1:100,
  y = rnorm(100)
)

ggplot(df) +
  aes(x, y) +
  geom_point()
shade <- if_else(df$y >= 0, "black", "gray")

ggplot(df) +
  aes(x, y) +
  geom_point(color = shade)
ggplot(data.frame(x = c(-3, 3))) +
  aes(x) +
  stat_function(fun = sin)
ggplot(data.frame(x = c(-3.5, 3.5))) +
  aes(x) +
  stat_function(fun = dnorm) +
  ggtitle("Std. Normal Density")
f <- function(x) exp(-abs(x)) * sin(2 * pi * x)
ggplot(data.frame(x = c(-3.5, 3.5))) +
  aes(x) +
  stat_function(fun = f) +
  ggtitle("Dampened Sine Wave")
par(ask = TRUE)
par(ask = FALSE)
Hit <Return> to see next plot:
par(ask = TRUE)

for (i in (11:15)) {
  g <- ggplot(data.frame(x = rnorm(i), y = 1:i)) +
    aes(x, y) +
    geom_point()
  print(g)
}

# don't forget to turn ask off after you're done
par(ask = FALSE)
# example data
z <- rnorm(1000)
y <- runif(1000)

# plot elements
p1 <- ggplot() +
  geom_point(aes(x = 1:1000, y = z))
p2 <- ggplot() +
  geom_point(aes(x = 1:1000, y = y))
p3 <- ggplot() +
  geom_density(aes(z))
p4 <- ggplot() +
  geom_density(aes(y))
devtools::install_github("thomasp85/patchwork")
library(patchwork)
p1 + p2 + p3 + p4
p3 / (p1 + p2 + p4)
library(patchwork)

df <- data.frame(x = c(0, 1))

g1 <- ggplot(df) +
  aes(x) +
  stat_function(
    fun = function(x)
      dbeta(x, 2, 4)
  ) +
  ggtitle("First")

g2 <- ggplot(df) +
  aes(x) +
  stat_function(
    fun = function(x)
      dbeta(x, 4, 1)
  ) +
  ggtitle("Second")

g3 <- ggplot(df) +
  aes(x) +
  stat_function(
    fun = function(x)
      dbeta(x, 1, 1)
  ) +
  ggtitle("Third")

g4 <- ggplot(df) +
  aes(x) +
  stat_function(
    fun = function(x)
      dbeta(x, .5, .5)
  ) +
  ggtitle("Fourth")

g1 + g2 + g3 + g4 + plot_layout(ncol = 2, byrow = TRUE)
g1 + g2 + g3 + g4 + plot_layout(ncol = 2, byrow = FALSE)
ggsave("filename.jpg")
ggsave("g1.png", plot = g1, units = "in", width = 5, height = 4)
set.seed(42)
x <- rnorm(100)
e <- rnorm(100, mean=0, sd=5)
y <- 5 + 15 * x + e
set.seed(42)
x <- rnorm(100)
e <- rnorm(100, mean = 0, sd = 5)
y <- 5 + 15 * x + e

lm(y ~ x)
#>
#> Call:
#> lm(formula = y ~ x)
#>
#> Coefficients:
#> (Intercept)            x
#>        4.56        15.14
Coefficients:
(Intercept)            x
      4.558       15.136
df <- data.frame(x, y)
head(df)
#>        x     y
#> 1  1.371 31.57
#> 2 -0.565  1.75
#> 3  0.363  5.43
#> 4  0.633 23.74
#> 5  0.404  7.73
#> 6 -0.106  3.94
lm(y ~ x, data = df)          # Take x and y from df
#>
#> Call:
#> lm(formula = y ~ x, data = df)
#>
#> Coefficients:
#> (Intercept)            x
#>        4.56        15.14
lm(y ~ u + v + w)
set.seed(42)
u <- rnorm(100)
v <- rnorm(100, mean = 3,  sd = 2)
w <- rnorm(100, mean = -3, sd = 1)
e <- rnorm(100, mean = 0,  sd = 3)

y <- 5 + 4 * u + 3 * v + 2 * w + e

lm(y ~ u + v + w)
#>
#> Call:
#> lm(formula = y ~ u + v + w)
#>
#> Coefficients:
#> (Intercept)            u            v            w
#>        4.77         4.17         3.01         1.91
df <- data.frame(y, u, v, w)
head(df)
#>       y      u     v     w
#> 1 16.67  1.371 5.402 -5.00
#> 2 14.96 -0.565 5.090 -2.67
#> 3  5.89  0.363 0.994 -1.83
#> 4 27.95  0.633 6.697 -0.94
#> 5  2.42  0.404 1.666 -4.38
#> 6  5.73 -0.106 3.211 -4.15
lm(y ~ u + v + w, data = df)
#>
#> Call:
#> lm(formula = y ~ u + v + w, data = df)
#>
#> Coefficients:
#> (Intercept)            u            v            w
#>        4.77         4.17         3.01         1.91
m <- lm(y ~ u + v + w)
lm(y ~ u + v + w)
#>
#> Call:
#> lm(formula = y ~ u + v + w)
#>
#> Coefficients:
#> (Intercept)            u            v            w
#>        4.77         4.17         3.01         1.91
m <- lm(y ~ u + v + w)
summary(m)
#>
#> Call:
#> lm(formula = y ~ u + v + w)
#>
#> Residuals:
#>    Min     1Q Median     3Q    Max
#> -5.383 -1.760 -0.312  1.856  6.984
#>
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)
#> (Intercept)    4.770      0.969    4.92  3.5e-06 ***
#> u              4.173      0.260   16.07  < 2e-16 ***
#> v              3.013      0.148   20.31  < 2e-16 ***
#> w              1.905      0.266    7.15  1.7e-10 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 2.66 on 96 degrees of freedom
#> Multiple R-squared:  0.885,  Adjusted R-squared:  0.882
#> F-statistic:  247 on 3 and 96 DF,  p-value: <2e-16
    coef(m)
#> (Intercept)           u           v           w
#>        4.77        4.17        3.01        1.91
    confint(m)
#>             2.5 % 97.5 %
#> (Intercept)  2.85   6.69
#> u            3.66   4.69
#> v            2.72   3.31
#> w            1.38   2.43
    resid(m)
#>       1       2       3       4       5       6       7       8       9
#> -0.5675  2.2880  0.0972  2.1474 -0.7169 -0.3617  1.0350  2.8040 -4.2496
#>      10      11      12      13      14      15      16      17      18
#> -0.2048 -0.6467 -2.5772 -2.9339 -1.9330  1.7800 -1.4400 -2.3989  0.9245
#>      19      20      21      22      23      24      25      26      27
#> -3.3663  2.6890 -1.4190  0.7871  0.0355 -0.3806  5.0459 -2.5011  3.4516
#>      28      29      30      31      32      33      34      35      36
#>  0.3371 -2.7099 -0.0761  2.0261 -1.3902 -2.7041  0.3953  2.7201 -0.0254
#>      37      38      39      40      41      42      43      44      45
#> -3.9887 -3.9011 -1.9458 -1.7701 -0.2614  2.0977 -1.3986 -3.1910  1.8439
#>      46      47      48      49      50      51      52      53      54
#>  0.8218  3.6273 -5.3832  0.2905  3.7878  1.9194 -2.4106  1.6855 -2.7964
#>      55      56      57      58      59      60      61      62      63
#> -1.3348  3.3549 -1.1525  2.4012 -0.5320 -4.9434 -2.4899 -3.2718 -1.6161
#>      64      65      66      67      68      69      70      71      72
#> -1.5119 -0.4493 -0.9869  5.6273 -4.4626 -1.7568  0.8099  5.0320  0.1689
#>      73      74      75      76      77      78      79      80      81
#>  3.5761 -4.8668  4.2781 -2.1386 -0.9739 -3.6380  0.5788  5.5664  6.9840
#>      82      83      84      85      86      87      88      89      90
#> -3.5119  1.2842  4.1445 -0.4630 -0.7867 -0.7565  1.6384  3.7578  1.8942
#>      91      92      93      94      95      96      97      98      99
#>  0.5542 -0.8662  1.2041 -1.7401 -0.7261  3.2701  1.4012  0.9476 -0.9140
#>     100
#>  2.4278
    deviance(m)
#> [1] 679
    anova(m)
#> Analysis of Variance Table
#>
#> Response: y
#>           Df Sum Sq Mean Sq F value  Pr(>F)
#> u          1   1776    1776   251.0 < 2e-16 ***
#> v          1   3097    3097   437.7 < 2e-16 ***
#> w          1    362     362    51.1 1.7e-10 ***
#> Residuals 96    679       7
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(lm(y ~ u + v + w))
lm(y ~ u + v + w) %>%
  summary
summary(m)
#>
#> Call:
#> lm(formula = y ~ u + v + w)
#>
#> Residuals:
#>    Min     1Q Median     3Q    Max
#> -5.383 -1.760 -0.312  1.856  6.984
#>
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)
#> (Intercept)    4.770      0.969    4.92  3.5e-06 ***
#> u              4.173      0.260   16.07  < 2e-16 ***
#> v              3.013      0.148   20.31  < 2e-16 ***
#> w              1.905      0.266    7.15  1.7e-10 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 2.66 on 96 degrees of freedom
#> Multiple R-squared:  0.885,  Adjusted R-squared:  0.882
#> F-statistic:  247 on 3 and 96 DF,  p-value: <2e-16
    summary(m)$call
    # Residuals:
    #     Min      1Q  Median      3Q     Max
    # -5.3832 -1.7601 -0.3115  1.8565  6.9840
summary(m)$coefficients
#>             Estimate Std. Error t value Pr(>|t|)
#> (Intercept)     4.77      0.969    4.92 3.55e-06
#> u               4.17      0.260   16.07 5.76e-29
#> v               3.01      0.148   20.31 1.58e-36
#> w               1.91      0.266    7.15 1.71e-10
+
+
Residual standard error::
+
[source, r]
# Residual standard error: 2.66 on 96 degrees of freedom
+
-------------------------------------------------------------------
This reports the standard error of the residuals (*σ*)—that is, the
sample standard deviation of *ε*.
-------------------------------------------------------------------

_R_^2^ (coefficient of determination)::
+
[source, r]
# Multiple R-squared:  0.8851,  Adjusted R-squared:  0.8815
+
-------------------------------------------------------------------
*R*^2^ is a measure of the model’s quality. Bigger is better.
Mathematically, it is the fraction of the variance of *y* that is
explained by the regression model. The remaining variance is not
explained by the model, so it must be due to other factors (i.e.,
unknown variables or sampling variability). In this case, the model
explains 0.4981 (49.81%) of the variance of *y*, and the remaining
0.5019 (50.19%) is unexplained.

That being said, we strongly suggest using the adjusted rather than
the basic *R*^2^. The adjusted value accounts for the number of
variables in your model and so is a more realistic assessment of
its effectiveness. In this case, then, we would use 0.8815,
not 0.8851s
-------------------------------------------------------------------

_F_ statistic::
+
[source, r]
# F-statistic: 246.6 on 3 and 96 DF,  p-value: < 2.2e-16
+
--------------------------------------------------------------------
The *F* statistic tells you whether the model is significant
or insignificant. The model is significant if any of the
coefficients are nonzero (i.e., if *β*~*i*~ ≠ 0 for some *i*). It is
insignificant if all coefficients are zero (*β*~1~ = *β*~2~ = … =
*β*~*n*~ = 0).

Conventionally, a *p*-value of less than 0.05 indicates that the
model is likely significant (one or more *β*~*i*~ are nonzero)
whereas values exceeding 0.05 indicate that the model is likely
not significant. Here, the probability is only 0.000391 that our
model is insignificant. That’s good.

Most people look at the *R*^2^ statistic first. The statistician
wisely starts with the *F* statistic, for if the model is not
significant then nothing else matters.
--------------------------------------------------------------------

[[see_also-id240]]
==== See Also

See <<recipe-id231>> for more on extracting statistics and information from the
model object.

[[recipe-id205]]
=== Performing Linear Regression Without an Intercept

[[problem-id205]]
==== Problem

You want to perform a linear regression, but you want to force the
intercept to be zero.

[[solution-id205]]
==== Solution

Add "`+` `0`" to the righthand side of your regression formula. That
will force `lm` to fit the model with a zero intercept:

[source, r]
The corresponding regression equation is:

++++
<ul class="simplelist">
  <li><em>y</em><sub><em>i</em></sub> = <em>βx</em><sub><em>i</em></sub> + <em>ε</em><sub><em>i</em></sub></li>
</ul>
++++

[[discussion-id205]]
==== Discussion

Linear regression ordinarily includes an intercept term, so that is the
default in R. In rare cases, however, you may want to fit the data while
assuming that the intercept is zero. In this you make a modeling
assumption: when _x_ is zero, _y_ should be zero.

When you force a zero intercept, the `lm` output includes a coefficient
for _x_ but no intercept for _y_, as shown here:

[source, r]
We strongly suggest you check that modeling assumption before
proceeding. Perform a regression with an intercept; then see if the
intercept could plausibly be zero. Check the intercept’s confidence
interval. In this example, the confidence interval is (6.26, 8.84):

[source, r]
Because the confidence interval does not contain zero, it is NOT
statistically plausible that the intercept could be zero. So in this
case, it is not reasonable to rerun the regression while forcing a zero
intercept.

[[title-highcor]]
=== Regressing Only Variables that Highly Correlate with your Dependent Variable

[[problem-highcor]]
==== Problem

You have a data frame with many variables and you want to build a
multiple linear regression using only the variables that are highly
correlated to your response (dependent) variable.

[[solution-highcor]]
==== Solution

If `df` is our data frame containing both our response (dependent) and
all our predictor (independent) variables and `dep_var` is our response
variable, we can figure out our best predictors and then use them in a
linear regression. If we want the top 4 predictor variables, we can use
this recipe:

[source, r]
This recipe is a combination of many differnt pieces of logic used
elsewhere in this book. We will describe each step here then walk
through it in the discussion using some example data.

First we drop the response variable out of our pipe chain so that we
have only our predictor variables in our data flow:

[source, r]
Then we use `map_dbl` from `purrr` to perform a pairwise correlation on
each column relative to the response variable.

[source, r]
map_dbl(cor, y = df$dep_var) %>%
We then take the resulting correlations and sort them in decreasing
order:

[source, r]
sort(decreasing = TRUE) %>%
We want only the top 4 correlated variables so we select the top 4
records in the resulting vector:

[source, r]
.[1:4] %>%
And we don't need the correlation values, only the names of the rows
which are the variable names from our original data frame `df`:

[source, r]
Then we can pass those names into our subsetting brackets to select only
the columns with names matching the ones we want:

[source, r]
mod <- lm(df$dep_var ~ as.matrix(best_pred))
# loads the pred data frame
load("./data/pred.rdata")

pred %>%
  select(-resp) %>%
  map_dbl(cor, y = pred$resp)
#> pred1 pred2 pred3 pred4 pred5 pred6
#> 0.573 0.279 0.753 0.799 0.322 0.607
pred %>%
  select(-resp) %>%
  map_dbl(cor, y = pred$resp) %>%
  sort(decreasing = TRUE)
#> pred4 pred3 pred6 pred1 pred5 pred2
#> 0.799 0.753 0.607 0.573 0.322 0.279
pred %>%
  select(-resp) %>%
  map_dbl(cor, y = pred$resp) %>%
  sort(decreasing = TRUE) %>%
  .[1:4]
#> pred4 pred3 pred6 pred1
#> 0.799 0.753 0.607 0.573
pred %>%
  select(-resp) %>%
  map_dbl(cor, y = pred$resp) %>%
  sort(decreasing = TRUE) %>%
  .[1:4] %>%
  names
#> [1] "pred4" "pred3" "pred6" "pred1"
pred %>%
  select(-resp) %>%
  map_dbl(cor, y = pred$resp) %>%
  sort(decreasing = TRUE) %>%
  .[1:4] %>%
  names %>%
  pred[.] %>%
  head
#>    pred4   pred3  pred6  pred1
#> 1  7.252  1.5127  0.560  0.206
#> 2  2.076  0.2579 -0.124 -0.361
#> 3 -0.649  0.0884  0.657  0.758
#> 4  1.365 -0.1209  0.122 -0.727
#> 5 -5.444 -1.1943 -0.391 -1.368
#> 6  2.554  0.6120  1.273  0.433
best_pred <- pred %>%
  select(-resp) %>%
  map_dbl(cor, y = pred$resp) %>%
  sort(decreasing = TRUE) %>%
  .[1:4] %>%
  names %>%
  pred[.]

mod <- lm(pred$resp ~ as.matrix(best_pred))
summary(mod)
#>
#> Call:
#> lm(formula = pred$resp ~ as.matrix(best_pred))
#>
#> Residuals:
#>    Min     1Q Median     3Q    Max
#> -1.485 -0.619  0.189  0.562  1.398
#>
#> Coefficients:
#>                           Estimate Std. Error t value Pr(>|t|)
#> (Intercept)                  1.117      0.340    3.28   0.0051 **
#> as.matrix(best_pred)pred4    0.523      0.207    2.53   0.0231 *
#> as.matrix(best_pred)pred3   -0.693      0.870   -0.80   0.4382
#> as.matrix(best_pred)pred6    1.160      0.682    1.70   0.1095
#> as.matrix(best_pred)pred1    0.343      0.359    0.95   0.3549
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 0.927 on 15 degrees of freedom
#> Multiple R-squared:  0.838,  Adjusted R-squared:  0.795
#> F-statistic: 19.4 on 4 and 15 DF,  p-value: 8.59e-06
lm(y ~ u*v)
y ~ u * v
y ~ u * v * w
y ~ u + v + w + u:v:w
y ~ u * v
y ~ u + v + u:v
y ~ (u + v) ^ 2
full.model <- lm(y ~ x1 + x2 + x3 + x4)
reduced.model <- step(full.model, direction = "backward")
min.model <- lm(y ~ 1)
fwd.model <-
  step(min.model,
       direction = "forward",
       scope = (~ x1 + x2 + x3 + x4))
# example data
set.seed(4)
n <- 150
x1 <- rnorm(n)
x2 <- rnorm(n, 1, 2)
x3 <- rnorm(n, 3, 1)
x4 <- rnorm(n,-2, 2)
e <- rnorm(n, 0, 3)
y <- 4 + x1 + 5 * x3 + e

# build the model
full.model <- lm(y ~ x1 + x2 + x3 + x4)
summary(full.model)
#>
#> Call:
#> lm(formula = y ~ x1 + x2 + x3 + x4)
#>
#> Residuals:
#>    Min     1Q Median     3Q    Max
#> -8.032 -1.774  0.158  2.032  6.626
#>
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)
#> (Intercept)  3.40224    0.80767    4.21  4.4e-05 ***
#> x1           0.53937    0.25935    2.08    0.039 *
#> x2           0.16831    0.12291    1.37    0.173
#> x3           5.17410    0.23983   21.57  < 2e-16 ***
#> x4          -0.00982    0.12954   -0.08    0.940
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 2.92 on 145 degrees of freedom
#> Multiple R-squared:  0.77,   Adjusted R-squared:  0.763
#> F-statistic:  121 on 4 and 145 DF,  p-value: <2e-16
reduced.model <- step(full.model, direction="backward")
#> Start:  AIC=327
#> y ~ x1 + x2 + x3 + x4
#>
#>        Df Sum of Sq  RSS AIC
#> - x4    1         0 1240 325
#> - x2    1        16 1256 327
#> <none>              1240 327
#> - x1    1        37 1277 329
#> - x3    1      3979 5219 540
#>
#> Step:  AIC=325
#> y ~ x1 + x2 + x3
#>
#>        Df Sum of Sq  RSS AIC
#> - x2    1        16 1256 325
#> <none>              1240 325
#> - x1    1        37 1277 327
#> - x3    1      3988 5228 539
#>
#> Step:  AIC=325
#> y ~ x1 + x3
#>
#>        Df Sum of Sq  RSS AIC
#> <none>              1256 325
#> - x1    1        44 1300 328
#> - x3    1      3974 5230 537
summary(reduced.model)
#>
#> Call:
#> lm(formula = y ~ x1 + x3)
#>
#> Residuals:
#>    Min     1Q Median     3Q    Max
#> -8.148 -1.850 -0.055  2.026  6.550
#>
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)
#> (Intercept)    3.648      0.751    4.86    3e-06 ***
#> x1             0.582      0.255    2.28    0.024 *
#> x3             5.147      0.239   21.57   <2e-16 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 2.92 on 147 degrees of freedom
#> Multiple R-squared:  0.767,  Adjusted R-squared:  0.763
#> F-statistic:  241 on 2 and 147 DF,  p-value: <2e-16
min.model <- lm(y ~ 1)
fwd.model <- step(
  min.model,
  direction = "forward",
  scope = (~ x1 + x2 + x3 + x4),
  trace = 0
)
summary(fwd.model)
#>
#> Call:
#> lm(formula = y ~ x3 + x1)
#>
#> Residuals:
#>    Min     1Q Median     3Q    Max
#> -8.148 -1.850 -0.055  2.026  6.550
#>
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)
#> (Intercept)    3.648      0.751    4.86    3e-06 ***
#> x3             5.147      0.239   21.57   <2e-16 ***
#> x1             0.582      0.255    2.28    0.024 *
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 2.92 on 147 degrees of freedom
#> Multiple R-squared:  0.767,  Adjusted R-squared:  0.763
#> F-statistic:  241 on 2 and 147 DF,  p-value: <2e-16
full.model <- lm(y ~ (x1 + x2 + x3 + x4) ^ 4)
reduced.model <- step(full.model, direction = "backward")
#> Start:  AIC=337
#> y ~ (x1 + x2 + x3 + x4)^4
#>
#>               Df Sum of Sq  RSS AIC
#> - x1:x2:x3:x4  1    0.0321 1145 335
#> <none>                     1145 337
#>
#> Step:  AIC=335
#> y ~ x1 + x2 + x3 + x4 + x1:x2 + x1:x3 + x1:x4 + x2:x3 + x2:x4 +
#>     x3:x4 + x1:x2:x3 + x1:x2:x4 + x1:x3:x4 + x2:x3:x4
#>
#>            Df Sum of Sq  RSS AIC
#> - x2:x3:x4  1      0.76 1146 333
#> - x1:x3:x4  1      8.37 1154 334
#> <none>                  1145 335
#> - x1:x2:x4  1     20.95 1166 336
#> - x1:x2:x3  1     25.18 1170 336
#>
#> Step:  AIC=333
#> y ~ x1 + x2 + x3 + x4 + x1:x2 + x1:x3 + x1:x4 + x2:x3 + x2:x4 +
#>     x3:x4 + x1:x2:x3 + x1:x2:x4 + x1:x3:x4
#>
#>            Df Sum of Sq  RSS AIC
#> - x1:x3:x4  1      8.74 1155 332
#> <none>                  1146 333
#> - x1:x2:x4  1     21.72 1168 334
#> - x1:x2:x3  1     26.51 1172 334
#>
#> Step:  AIC=332
#> y ~ x1 + x2 + x3 + x4 + x1:x2 + x1:x3 + x1:x4 + x2:x3 + x2:x4 +
#>     x3:x4 + x1:x2:x3 + x1:x2:x4
#>
#>            Df Sum of Sq  RSS AIC
#> - x3:x4     1      0.29 1155 330
#> <none>                  1155 332
#> - x1:x2:x4  1     23.24 1178 333
#> - x1:x2:x3  1     31.11 1186 334
#>
#> Step:  AIC=330
#> y ~ x1 + x2 + x3 + x4 + x1:x2 + x1:x3 + x1:x4 + x2:x3 + x2:x4 +
#>     x1:x2:x3 + x1:x2:x4
#>
#>            Df Sum of Sq  RSS AIC
#> <none>                  1155 330
#> - x1:x2:x4  1      23.4 1178 331
#> - x1:x2:x3  1      31.5 1187 332
lm(y ~ x1, subset=1:100)          # Use only x[1:100]
## example data
n <- 1000
x <- rnorm(n)
e <- rnorm(n, 0, .5)
y <- 3 + 2 * x + e
lm(y ~ x, subset = 1:500)
#>
#> Call:
#> lm(formula = y ~ x, subset = 1:500)
#>
#> Coefficients:
#> (Intercept)            x
#>           3            2
lm(y ~ x, subset = 1:floor(length(x) / 2))
#>
#> Call:
#> lm(formula = y ~ x, subset = 1:floor(length(x)/2))
#>
#> Coefficients:
#> (Intercept)            x
#>           3            2
load('./data/lab_df.rdata')
lm(y ~ x, subset = (lab == "NJ"), data = lab_df)
#>
#> Call:
#> lm(formula = y ~ x, data = lab_df, subset = (lab == "NJ"))
#>
#> Coefficients:
#> (Intercept)            x
#>        2.58         5.03
lm(y ~ u + v)    # Not quite right
lm(y ~ u + u ^ 2)  # That's an interaction, not a quadratic term
lm(y ~ I(u + v))
lm(y ~ u + I(u ^ 2))
load('./data/df_squared.rdata')
m <- lm(y ~ u + I(u ^ 2), data = df_squared)
predict(m, newdata = data.frame(u = 13.4))
#>   1
#> 877
lm(y ~ poly(x, 3, raw = TRUE))
x_sq <- x ^ 2
x_cub <- x ^ 3
m <- lm(y ~ x + x_sq + x_cub)
m <- lm(y ~ poly(x, 3, raw = TRUE))
lm(y ~ x + x^2 + x^3)     # Does not do what you think!
lm(y ~ x + I(x ^ 2) + I(x ^ 3))
lm(log(y) ~ x)
# read in our example data
load(file = './data/df_decay.rdata')
z <- df_decay$z
t <- df_decay$time

# transform and model
m <- lm(log(z) ~ t)
summary(m)
#>
#> Call:
#> lm(formula = log(z) ~ t)
#>
#> Residuals:
#>     Min      1Q  Median      3Q     Max
#> -0.4479 -0.0993  0.0049  0.0978  0.2802
#>
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)
#> (Intercept)   0.6887     0.0306    22.5   <2e-16 ***
#> t            -2.0118     0.0351   -57.3   <2e-16 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 0.148 on 98 degrees of freedom
#> Multiple R-squared:  0.971,  Adjusted R-squared:  0.971
#> F-statistic: 3.28e+03 on 1 and 98 DF,  p-value: <2e-16
lm(sqrt(y) ~ month)
lm(y ~ sqrt(x))
lm(log(y) ~ log(x))
library(MASS)
m <- lm(y ~ x)
boxcox(m)
set.seed(9)
x <- 10:100
eps <- rnorm(length(x), sd = 5)
y <- (x + eps) ^ (-1 / 1.5)
m <- lm(y ~ x)
summary(m)
#>
#> Call:
#> lm(formula = y ~ x)
#>
#> Residuals:
#>      Min       1Q   Median       3Q      Max
#> -0.04032 -0.01633 -0.00792  0.00996  0.14516
#>
#> Coefficients:
#>              Estimate Std. Error t value Pr(>|t|)
#> (Intercept)  0.166885   0.007078    23.6   <2e-16 ***
#> x           -0.001465   0.000116   -12.6   <2e-16 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 0.0291 on 89 degrees of freedom
#> Multiple R-squared:  0.641,  Adjusted R-squared:  0.637
#> F-statistic:  159 on 1 and 89 DF,  p-value: <2e-16
plot(m, which = 1)       # Plot only the fitted vs residuals
library(MASS)
#>
#> Attaching package: 'MASS'
#> The following object is masked from 'package:dplyr':
#>
#>     select
bc <- boxcox(m)
which.max(bc$y)
#> [1] 13
lambda <- bc$x[which.max(bc$y)]
lambda
#> [1] -1.52
z <- y ^ lambda
m2 <- lm(z ~ x)
summary(m2)
#>
#> Call:
#> lm(formula = z ~ x)
#>
#> Residuals:
#>     Min      1Q  Median      3Q     Max
#> -13.459  -3.711  -0.228   2.206  14.188
#>
#> Coefficients:
#>             Estimate Std. Error t value Pr(>|t|)
#> (Intercept)  -0.6426     1.2517   -0.51     0.61
#> x             1.0514     0.0205   51.20   <2e-16 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> Residual standard error: 5.15 on 89 degrees of freedom
#> Multiple R-squared:  0.967,  Adjusted R-squared:  0.967
#> F-statistic: 2.62e+03 on 1 and 89 DF,  p-value: <2e-16
m2 <- lm(I(y ^ lambda) ~ x)
load(file = './data/conf.rdata')
m <- lm(y ~ x1 + x2)
confint(m)
#>             2.5 % 97.5 %
#> (Intercept) -3.90   6.47
#> x1          -2.58   6.24
#> x2           4.67   5.17
confint(m)
#>             2.5 % 97.5 %
#> (Intercept) -3.90   6.47
#> x1          -2.58   6.24
#> x2           4.67   5.17
confint(m, level = 0.99)
#>             0.5 % 99.5 %
#> (Intercept) -5.72   8.28
#> x1          -4.12   7.79
#> x2           4.58   5.26
m <- lm(y ~ x1 + x2)
plot(m, which = 1)
m <- lm(y ~ x1 + x2)
plot(m)
library(car)
#> Loading required package: carData
#>
#> Attaching package: 'car'
#> The following object is masked from 'package:dplyr':
#>
#>     recode
#> The following object is masked from 'package:purrr':
#>
#>     some
outlierTest(m)
#> No Studentized residuals with Bonferonni p < 0.05
#> Largest |rstudent|:
#>   rstudent unadjusted p-value Bonferonni p
#> 2     2.27             0.0319        0.956
length(x1)
#> [1] 30
length(x2)
#> [1] 30
length(y)
#> [1] 30

m <- lm(y ~ x1 + x2)
par(mfrow = (c(2, 2))) # this gives us a 2x2 plot
plot(m)
load(file = './data/bad.rdata')
m <- lm(y2 ~ x3 + x4)
par(mfrow = (c(2, 2)))      # this gives us a 2x2 plot
plot(m)
outlierTest(m)
#>    rstudent unadjusted p-value Bonferonni p
#> 28     4.46           7.76e-05       0.0031
influence.measures(m)
influence.measures(m)
#> Influence measures of
#>   lm(formula = y2 ~ x3 + x4) :
#>
#>      dfb.1_   dfb.x3   dfb.x4    dffit cov.r   cook.d    hat inf
#> 1  -0.18784  0.15174  0.07081 -0.22344 1.059 1.67e-02 0.0506
#> 2   0.27637 -0.04367 -0.39042  0.45416 1.027 6.71e-02 0.0964
#> 3  -0.01775 -0.02786  0.01088 -0.03876 1.175 5.15e-04 0.0772
#> 4   0.15922 -0.14322  0.25615  0.35766 1.133 4.27e-02 0.1156
#> 5  -0.10537  0.00814 -0.06368 -0.13175 1.078 5.87e-03 0.0335
#> 6   0.16942  0.07465  0.42467  0.48572 1.034 7.66e-02 0.1062
etc ...
library(lmtest)
m <- lm(y ~ x)           # Create a model object
dwtest(m)                # Test the model residuals
acf(m)                   # Plot the ACF of the model residuals
library(lmtest)
#> Loading required package: zoo
#>
#> Attaching package: 'zoo'
#> The following objects are masked from 'package:base':
#>
#>     as.Date, as.Date.numeric
load(file = './data/ac.rdata')
m <- lm(y1 ~ x)
dwtest(m)
#>
#>  Durbin-Watson test
#>
#> data:  m
#> DW = 2, p-value = 0.4
#> alternative hypothesis: true autocorrelation is greater than 0
m <- lm(y2 ~ x)
dwtest(m)
#>
#>  Durbin-Watson test
#>
#> data:  m
#> DW = 2, p-value = 0.01
#> alternative hypothesis: true autocorrelation is greater than 0
dwtest(m, alternative = "two.sided")
load(file = './data/pred2.rdata')

m <- lm(y ~ u + v + w)
preds <- data.frame(u = 3.1, v = 4.0, w = 5.5)
predict(m, newdata = preds)
#>  1
#> 45
preds <- data.frame(
  u = c(3.0, 3.1, 3.2, 3.3),
  v = c(3.9, 4.0, 4.1, 4.2),
  w = c(5.3, 5.5, 5.7, 5.9)
)
predict(m, newdata = preds)
#>    1    2    3    4
#> 43.8 45.0 46.3 47.5
predict(m, newdata = preds, interval = "prediction")
predict(m, newdata = preds, interval = "prediction")
#>    fit  lwr  upr
#> 1 43.8 38.2 49.4
#> 2 45.0 39.4 50.7
#> 3 46.3 40.6 51.9
#> 4 47.5 41.8 53.2
oneway.test(x ~ f)
load(file = './data/anova.rdata')
oneway.test(r ~ mon, data = GSPC_df)
#>
#>  One-way analysis of means (not assuming equal variances)
#>
#> data:  r and mon
#> F = 2, num df = 10, denom df = 7000, p-value = 0.03
oneway.test(r ~ mon, data = GSPC_df, subset = tail(seq_along(r), 2500))
#>
#>  One-way analysis of means (not assuming equal variances)
#>
#> data:  r and mon
#> F = 0.7, num df = 10, denom df = 1000, p-value = 0.8
oneway.test(x ~ f, var.equal = TRUE)
m <- aov(x ~ f)
summary(m)
interaction.plot(pred1, pred2, resp)
library(faraway)
data(rats)
interaction.plot(rats$poison, rats$treat, rats$time)
m <- aov(x ~ f)
TukeyHSD(m)
plot(TukeyHSD(m))
load(file = './data/anova.rdata')
oneway.test(r ~ wday, subset = 1:2500, data = GSPC_df)
#>
#>  One-way analysis of means (not assuming equal variances)
#>
#> data:  r and wday
#> F = 10, num df = 4, denom df = 1000, p-value = 5e-10
m <- aov(r ~ wday, subset = 1:2500, data = GSPC_df)
TukeyHSD(m)
#>   Tukey multiple comparisons of means
#>     95% family-wise confidence level
#>
#> Fit: aov(formula = r ~ wday, data = GSPC_df, subset = 1:2500)
#>
#> $wday
#>              diff       lwr       upr p adj
#> Mon-Fri -0.003153 -4.40e-03 -0.001911 0.000
#> Thu-Fri -0.000934 -2.17e-03  0.000304 0.238
#> Tue-Fri -0.001855 -3.09e-03 -0.000618 0.000
#> Wed-Fri -0.000783 -2.01e-03  0.000448 0.412
#> Thu-Mon  0.002219  9.79e-04  0.003460 0.000
#> Tue-Mon  0.001299  5.85e-05  0.002538 0.035
#> Wed-Mon  0.002370  1.14e-03  0.003605 0.000
#> Tue-Thu -0.000921 -2.16e-03  0.000314 0.249
#> Wed-Thu  0.000151 -1.08e-03  0.001380 0.997
#> Wed-Tue  0.001072 -1.57e-04  0.002300 0.121
plot(TukeyHSD(m))
kruskal.test(x ~ f)
load(file = './data/student_data.rdata')
head(student_data)
#> # A tibble: 6 x 4
#>   att.fact hw.mean midterm hw
#>   <fct>      <dbl>   <dbl> <fct>
#> 1 3          0.808   0.818 4
#> 2 3          0.830   0.682 4
#> 3 3          0.444   0.511 2
#> 4 3          0.663   0.670 3
#> 5 2          0.9     0.682 4
#> 6 3          0.948   0.954 4
kruskal.test(midterm ~ hw, data = student_data)
#>
#>  Kruskal-Wallis rank sum test
#>
#> data:  midterm by hw
#> Kruskal-Wallis chi-squared = 30, df = 4, p-value = 4e-05
anova(m1, m2)
load(file = './data/anova2.rdata')
m1 <- lm(y ~ u)
m2 <- lm(y ~ u + v)
m3 <- lm(y ~ u + v + w)
anova(m1, m2)
#> Analysis of Variance Table
#>
#> Model 1: y ~ u
#> Model 2: y ~ u + v
#>   Res.Df RSS Df Sum of Sq    F Pr(>F)
#> 1     18 197
#> 2     17 130  1      66.4 8.67 0.0091 **
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(m2, m3)
#> Analysis of Variance Table
#>
#> Model 1: y ~ u + v
#> Model 2: y ~ u + v + w
#>   Res.Df RSS Df Sum of Sq    F Pr(>F)
#> 1     17 130
#> 2     16 103  1      27.5 4.27  0.055 .
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Distribution	Package name
Ubuntu or Debian	r-base
Red Hat or Fedora	R.i386
Suse	R-base

Labeled key	Ctrl-key combination	Effect
Up arrow	Ctrl-P	Recall previous command by moving backward through the history of commands.
Down arrow	Ctrl-N	Move forward through the history of commands.
Backspace	Ctrl-H	Delete the character to the left of cursor.
Delete (Del)	Ctrl-D	Delete the character to the right of cursor.
Home	Ctrl-A	Move cursor to the start of the line.
End	Ctrl-E	Move cursor to the end of the line.
Right arrow	Ctrl-F	Move cursor right (forward) one character.
Left arrow	Ctrl-B	Move cursor left (back) one character.
	Ctrl-K	Delete everything from the cursor position to the end of the line.
	Ctrl-U	Clear the whole darn line and start over.
Tab		Name completion (on some platforms).

Operator	Meaning	See also
`[ [[`	Indexing	“Selecting Vector Elements”
`:: :::`	Access variables in a name space (environment)
`$ @`	Component extraction, slot extraction
`^`	Exponentiation (right to left)
`- +`	Unary minus and plus
`:`	Sequence creation	Recipes pass:[<a data-type="xref” data-xrefstyle="select:labelnumber” href="#recipe-id021">#recipe-id021</a>, <a data-type="xref” data-xrefstyle="select:labelnumber” href="#recipe-id047">#recipe-id047</a>
`%` any `%` (includin	g `%>%`) Special operators	Discussion
`* /`	Multiplication, division	Discussion
`+ -`	Addition, subtraction
`== != < > <= >=`	Comparison	“Comparing Vectors”
`!`	Logical negation
`& &&`	Logical “and”, short-circuit “and”
`
`	Logical “or”, short-circuit “or”
`~`	Formula	“Performing Simple Linear Regression”
`-> ->>`	Rightward assignment	“Setting Variables”
`=`	Assignment (right to left)	“Setting Variables”
`<- <<-`	Assignment (right to left)	“Setting Variables”
`?`	Help	“Getting Help on a Function”

Object	Example	Mode
Number	`3.1415`	numeric
Vector of numbers	`c(2.7.182, 3.1415)`	numeric
Character string	`"Moe"`	character
Vector of character strings	`c("Moe", "Larry", "Curly")`	character
Factor	`factor(c("NY", "CA", "IL"))`	numeric
List	`list("Moe", "Larry", "Curly")`	list
Data frame	`data.frame(x=1:3, y=c("NY", "CA", "IL"))`	list
Function	`print`	function

Conversion	How	Notes
Vector→List	`as.list(vec)`	Don’t use `list(vec)`; that creates a 1-element list whose only element is a copy of `vec`.
Vector→Matrix	To create a 1-column matrix: `cbind(vec)` or `as.matrix(vec)`	See “Initializing a Matrix”.
To create a 1-row matrix: `rbind(vec)`
To create an n × m matrix: `matrix(vec,n,m)`
Vector→Data frame	To create a 1-column data frame: `as.data.frame(vec)`
To create a 1-row data frame: `as.data.frame(rbind(vec))`
List→Vector	`unlist(lst)`	Use `unlist` rather than `as.vector`; see Note 1 and “Flatten a List into a Vector”.
List→Matrix	To create a 1-column matrix: `as.matrix(lst)`
To create a 1-row matrix: `as.matrix(rbind(lst))`
To create an n × m matrix: `matrix(lst,n,m)`
List→Data frame	If the list elements are columns of data: `as.data.frame(lst)`
If the list elements are rows of data: “Initializing a Data Frame from Row Data”
Matrix→Vector	`as.vector(mat)`	Returns all matrix elements in a vector.
Matrix→List	`as.list(mat)`	Returns all matrix elements in a list.
Matrix→Data frame	`as.data.frame(mat)`
Data frame→Vector	To convert a 1-row data frame: `df[1,]`	See Note 2.
To convert a 1-column data frame: `df[,1]` or `df[[1]]`
Data frame→List	`as.list(df)`	See Note 3.
Data frame→Matrix	`as.matrix(df)`	See Note 4.

Table 8-1. Common Discrete Distributions
Discrete distribution	R name	Parameters
Binomial	binom	n = number of trials; p = probability of success for one trial
Geometric	geom	p = probability of success for one trial
Hypergeometric	hyper	m = number of white balls in urn; n = number of black balls in urn; k = number of balls drawn from urn
Negative binomial (NegBinomial)	nbinom	size = number of successful trials; either prob = probability of successful trial or mu = mean
Poisson	pois	lambda = mean

Table 8-2. Common Continuous Distributions
Continuous distribution	R name	Parameters
Beta	beta	shape1; shape2
Cauchy	cauchy	location; scale
Chi-squared (Chisquare)	chisq	df = degrees of freedom
Exponential	exp	rate
F	f	df1 and df2 = degrees of freedom
Gamma	gamma	rate; either rate or scale
Log-normal (Lognormal)	lnorm	meanlog = mean on logarithmic scale;
		sdlog = standard deviation on logarithmic scale
Logistic	logis	location; scale
Normal	norm	mean; sd = standard deviation
Student’s t (TDist)	t	df = degrees of freedom
Uniform	unif	min = lower limit; max = upper limit
Weibull	weibull	shape; scale
Wilcoxon	wilcox	m = number of observations in first sample;
		n = number of observations in second sample

Table 8-3. (#tab:distributions) Discrete Distributions
Distribution	Density function: P(X = x)	Distribution function: P(X ≤ x)
Binomial	dbinom(x, size, prob)	pbinom(x, size, prob)
Geometric	dgeom(x, prob)	pgeom(x, prob)
Poisson	dpois(x, lambda)	ppois(x, lambda)

Table 8-4. Continuous Distributions
Distribution	Distribution function: P(X ≤ x)
Normal	pnorm(x, mean, sd)
Student’s t	pt(x, df)
Exponential	pexp(x, rate)
Gamma	pgamma(x, shape, rate)
Chi-squared (χ²)	pchisq(x, df)

R Cookbook

R Cookbook

Revision History for the Second Edition

Chapter 1. Getting Started and Getting Help

Introduction

Downloading and Installing R

Problem

Solution

Discussion

See Also

Installing R Studio

Problem

Solution

Discussion

Starting R Studio

Problem

Solution

Discussion

Figure 1-1. R and R Studio icons in OSX

Figure 1-2. The R Console in OSX

Entering Commands

Problem

Solution

Discussion

See Also

Exiting from R Studio

Problem

Solution

Discussion

Figure 1-3. Save Workspace Options

See Also

Interrupting R

Problem

Solution

Discussion

See Also

Viewing the Supplied Documentation

Problem

Solution

Discussion

Figure 1-4. R Studio help.start

See Also

Getting Help on a Function

Problem

Solution

Discussion

Figure 1-5. R Studio Tooltip

See Also

Searching the Supplied Documentation

Problem

Solution

Discussion

See Also

Getting Help on a Package

Problem

Solution

Discussion

See Also

Searching the Web for Help

Problem

Solution

Discussion

Figure 1-6. RSeek

See Also

Finding Relevant Functions and Packages

Problem

Solution

Discussion

See Also

Searching the Mailing Lists

Problem

Solution

Discussion

See Also

Submitting Questions to Stack Overflow or Elsewhere in the Community

Problem

Solution

Discussion

See Also

Chapter 2. Some Basics