#Basic calculations
1+1
2*2
#Logical operators
1>2
1<2
1&0
1|0
#Creating a vector
n = c(2, 3, 5,6,7)
s = c("aa", "bb", "cc", "dd", "ee")
s
s = c( "bb", 1)
s
a=c(n,s)
#Creating a list
x = list(n, s)
#Creating a matrix
as.matrix(n)
as.matrix(s)
#as.matrix(c(n,s),nrow=5,ncol=2,byrow=TRUE)
help(as.matrix)
matrix(c(1,2,3, "a","b","c"), nrow = 3, ncol = 2)
matrix(c(n,s), nrow = 5, ncol = 2)
# Importing datasets
help(read)
t=read.csv("D:/in-class/Titanic.csv")
t2=read.table("D:/in-class/credit_default.txt")
#In case there is an issue with importing
dataset
, consider specifying quote=, as below
t3=read.csv("D:/in-class/product_search.csv",quote="\"")
# Structure of dataset
t=read.csv("D:/in-class/Titanic.csv")
str(t) # gives us quite a few information about the dataset
class(t) # typically, the imported datasets as always imported as data.frame objects
dim(t) # dimension of an abject (data.frame)
nrow(t) # number of rows
ncol(t) # number of columns
colnames(t) # specify column names
class(t$Survived) # class of a variable
head(t) # gives us the first few rows of a dataset
t[1,1] # gives us the value in first row & first column. Note the syntax: [rows,columns]
t[1:3,1] # gives us the values in first 3 rows of 1st column
t[c(1,100,500),]
t[1:3,] # gives us the values in first 3 rows of all columns (note that when we dont specify the filtering condition in column index, the result includes all columns)
t$Survived[1:3]# gives us the values of first 3 rows of "Survived" variable
t[c(1,4),-c(1,3)] # c() is a function to get a combination of values. c(1,4) gives us the first & 4th row. -c(1) excludes the first column
t[c(1,4),c("Survived","Fare")] # c() is a function to get a
combination
of values. c(1,4) gives us the first & 4th row. -c(1) excludes the first column
t[1:3,"Survived"]
# Data manipulation
t$PassengerId=NULL
summary(t)
t$unknown_age=0
t$unknown_age=ifelse(is.na(t$Age),1,0) # the syntax here is, ifelse(condition, value if condition is true, value if condition is false)
# is.na() function helps us in identifying if there any NA values in dataset. Be sure to remove or impute (replace) NA values within a dataset
unique(t$Embarked) # gives all the unique values within a variable
table(t$Embarked) # table gives a count of all the unique values in dataset
mean(t$Age) # mean as a function
sum(t$Age)
mean(t$Age,na.rm=TRUE) # na.rm=TRUE helps in removing the missing values, if they exist
t$Age2=ifelse(t$unknown_age==1,mean(t$Age,na.rm=TRUE),t$Age) # one initializes a new variable within a dataset by using the $ operator & the new variable name
summary(t) # summary is typically the first step performed after importing dataset
order(t$Age2) # order function is used to sort dataset. It gives out the index of rows that a value is sorted to
t=t[order(t$Age2),] # t is sorted based on the order of age
mean(t$Survived[1:50])
mean(t$Survived[(nrow(t)-50):nrow(t)])
t2=t[1:50,]
t_male=t[t$Sex=="male",] # one can filter for criterion by specifying the variable name with a == and the value it is to be filtered to
t_female=t[t$Sex=="female",]
mean(t_male$Survived)
mean(t_female$Survived)
t2=t[t$Sex=="male" & t$Age2<10,]
mean(t2$Survived)
t2=t[t$Sex=="female" | t$Age2<10,]
mean(t2$Survived)
install.packages("sqldf")
library(sqldf)
t3=sqldf("select sex,avg(survived) from t group by sex")
# SQL like filtering or aggregation can be done using the sqldf function
t$age3=ifelse(t$Age2<10,1,0)
t2=t[,c("Age2","Sex","Survived")] # one can filter for the
columns
of interest y specifying the c() function with the variables that are needed
help(aggregate)
aggregate(t$Survived,by=list(t$Sex,t$Pclass),sum)
# aggregate function works similar to sqldf where grouping operations can be done
#seq function is used to generate numbers by a given step size. seq(0,1,0.2) gives us c(0,0.2,0.4,0.6,0.8,1)
#quantile gives us the values at the various percentiles specified
help(quantile)
summary(t)
t$age3=as.character(t$Age) # as.character function converts a value into a character variablle
quantile(t$Age2,probs=seq(0,0.5,0.1))[2] # gives us the second value in the output of quantile function
x=quantile(t$Age2,probs=seq(0,1,0.1))[2]
t2=t[t$Age2<x,]
mean(t2$Survived)
t$less_than_10=ifelse(t$Age2<x,1,0)
aggregate(t$Survived,by=list(t$Sex,t$less_than_10),mean) # aggregation can be done over multiple variables by using c() function
t2=t[!t$Age2<x,] # ! is used as a engation statement
mean(t2$Survived)
# Loops
t=read.table("D:/in-class/credit_default.txt")
for(i in 1:3){
print(i)
}
summary(t)
# a good idea is to note the difference between mean & median
values
of variables
mean(t$DebtRatio)
median(t$DebtRatio)
t2=t
# it's a good practcie to test out the code of for loop before looping it through, by assigning a certain value of i & test out the for loop code
i=2
t2[,i]=ifelse(is.na(t2[,i]),median(t2[,i],na.rm=TRUE),t2[,i])
t2[,i]=ifelse(t2[,i]<median(t2[,i],na.rm=TRUE),"Low","High")
print(aggregate(t2$SeriousDlqin2yrs,by=list(t2[,i]),mean))
for(i in 1:ncol(t2)){
t2[,i]=ifelse(is.na(t2[,i]),median(t2[,i],na.rm=TRUE),t2[,i])
}
# below is an exercise where we are imputing missing value with median values & then flagging variables as high when the value is above median value & low when the value is below median value
for(i in 2:ncol(t2)){
t2[,i]=ifelse(is.na(t2[,i]),median(t2[,i],na.rm=TRUE),t2[,i])
t2[,i]=ifelse(t2[,i]<median(t2[,i],na.rm=TRUE),"Low","High")
print(colnames(t2[i]));
print(aggregate(t2$SeriousDlqin2yrs,by=list(t2[,i]),mean))
}
df=data.frame(group=c("a","b"),avg=c(2,2))
#joins
search=read.csv("D:/in-class/product_search.csv",quote="\"")
descriptions=read.csv("D:/in-class/product_descriptions.csv",quote="\"")
summary(search)
colnames(search)
colnames(descriptions)
help(merge)
# in a typical merge function, we have to specify the x (first) table, the y (second) table to which we are joining the first table
# we would also have to specify the variable based on which we are joining the datasets using the "by" parameter
# in case the column name of by parameter is different in datasets, we can use by.x & by.y
# by default merge does an inner join (inner join is when only the values that are common in both tables are joined)
# all.x=TRUE helps us do a left join (left join is when all the values in x table are retained even if some of them do not have a match in the second table)
# all.y = TRUE does a right join where all the values in right (second) table are retained
# assume that x table has productid as (1,2,3) & right (y) table has productid as (1,5,6)
# inner join of these two tables gives us the values of only productid =1 (as it is the only one in common)
# left join gives us the
information
of (1,2,3) however, the info of pid 1 will be full and info of pid 2,3 would be blank, as right table does not have info about these 2 pids
# right join givs the information of (1,5,6) where we have pid 1 info completely and info of 5,6 is missing
search_descriptions=merge(search,descriptions,by="product_uid",all.x=TRUE)
search_descriptions1=merge(search,descriptions,by="product_uid",all.y=TRUE)
search_descriptions2=merge(descriptions,search,by="product_uid",all.x=TRUE)
nrow(search_descriptions)
nrow(search_descriptions1)
nrow(search_descriptions2)
search_descriptions2$missing_id=ifelse(is.na(search_descriptions2$id),1,0)
sum(search_descriptions2$missing_id)
x=search_descriptions2[search_descriptions2$missing_id==0,]
length(unique(x$Product_uid))
system.time(search_descriptions<-merge(search,descriptions,by="product_uid",all.x=TRUE))
# note the difference in speed between base "merge" statement & fread/ data.table "merge" statment
install.packages("data.table")
library(data.table)
search=fread("D:/in-class/product_search.csv")
descriptions=fread("D:/in-class/product_descriptions.csv")
system.time(descriptions<-read.csv("D:/in-class/product_descriptions.csv"))
system.time(descriptions<-fread("D:/in-class/product_descriptions.csv"))
write.csv(search,"D:/in-class/search_output.csv",row.names=FALSE)
search_descriptions=merge(search,descriptions,by="product_uid",all.x=TRUE)
system.time(search_descriptions<-merge(search,descriptions,by="product_uid",all.x=TRUE))
# writing custom functions
square = function(x) {x*x}
square(13.5)
square("two")
addition = function(x,y) {x+y}
tt=as.data.frame(quantile(t$Age2,probs=seq(0,1,0.1)))