******************** 
* PART 1 - GENERIC *
********************

### Loading the data frame we will work on ###

# Read the tweet sample into the data frame tviti; the first row of the CSV file holds the column names
tviti = read.csv(file = "tviti.1000.csv", header = TRUE)

-- Use head(tviti) to check the data

### Creating crosstabs to work on ###

# Crosstab of the total number of tweets by gender (rows) and language
# standardness (columns); the counts are taken from the file tviti.freq.csv.
# The values are filled in row by row, the row and column names are supplied
# through dimnames, and the matrix is converted to table format in one go.
L.by.gender = as.table(matrix(
  c( 782005, 268733, 100562,
    1748410, 506605, 144350),
  ncol = 3,
  byrow = TRUE,
  dimnames = list(c("female", "male"), c("L1", "L2", "L3"))
))

# Relative frequencies (proportions) computed within each table row (margin 1)
L.by.gender.p = prop.table(L.by.gender, margin = 1)

# Crosstab of gender by linguistic standardness for the tweet sample (tviti);
# exclude = "neutral" leaves the "neutral" values out of the table
L.by.gender.1000 = xtabs(~ tviti$gender + tviti$standard_ling, exclude = "neutral")

# The same request written with "data =": the data frame is named once,
# so the column names no longer need the tviti$ prefix
L.by.gender.1000.2 = xtabs(~ gender + standard_ling, exclude = "neutral", data = tviti)

# Row-wise proportions for the sample crosstab
L.by.gender.1000.p = prop.table(L.by.gender.1000, margin = 1)

-- Create similar tables for sentiment by gender, and for linguistic by technical standardness, for all data (see tviti.freq.csv) and/or for sample data (tviti) 


************************** 
* PART 2a - DESCRIPTIVES *
**************************

### Frequencies other than crosstabs ###

# Total number of observations at each level of a variable (here: gender)
table(tviti$gender)

# Number of observations with one particular value: the count for
# gender "female" appears under TRUE, all remaining observations under FALSE
table(tviti$gender == "female")

# Two conditions joined with & (and): female users AND positive sentiment
table(tviti$gender == "female" & tviti$sentiment == "positive")

# Second condition negated with != (is not): female users whose sentiment
# is anything other than "positive"
table(tviti$gender == "female" & tviti$sentiment != "positive")

-- Find the numbers of tweets scored L1, L2 and L3 on linguistic standardness; find the numbers of tweets by female users scored T1, T2 and T3 on technical standardness

### Descriptive measures ###

# Descriptive measures for the number of words per tweet.
# The column is addressed by its full name (words), as in the rest of this
# handout; the abbreviated tviti$word only works because $ silently completes
# partial column names, and it would stop working as soon as the data frame
# gained another column whose name starts with "word".
mean(tviti$words)       # arithmetic mean
sd(tviti$words)         # standard deviation
var(tviti$words)        # variance

median(tviti$words)     # median
quantile(tviti$words)   # quantiles (by default minimum, quartiles and maximum)
min(tviti$words)        # smallest value
max(tviti$words)        # largest value

summary(tviti$words)    # minimum, quartiles, mean and maximum in a single call

-- Calculate the mean and standard deviation for the number of tokens, number of characters and number of punctuation symbols; calculate the median for the same data 

# Referring to a portion of the data: mean number of words in tweets by female users
# (full column name words is used rather than relying on partial matching of tviti$word)
mean(tviti$words[tviti$gender == "female"])

-- Calculate the average number of emoticons in tweets by male and female users, and in tweets by corporate and private users

### Repeating operations over multiple sets of data ###

# special functions allow you to use a single command for doing the same operation over several (sub)sets of data; the most useful function for data frames is tapply

# Mean number of words for each level of a variable (here: gender).
# The column is written out in full (words) instead of relying on $ completing the partial name word.
tapply(tviti$words, tviti$gender, mean)

# The same with subsets of the data: only tweets with positive sentiment are used;
# the values and the grouping variable must be subset with the same condition
tapply(tviti$words[tviti$sentiment == "positive"], tviti$gender[tviti$sentiment == "positive"], mean)

-- Use tapply to calculate the mean linguistic standardness score (standard_ling_n) by sentiment alone, and by sentiment but only for corporate users


******************** 
* PART 2b - GRAPHS *
********************

### Some basic graphs ###

# Scatterplot of two numeric variables; plot() gives different graphs depending on the data format
plot(tviti$words, tviti$tokens)

plot(tviti$standard_tech_n, tviti$standard_ling_n)

# Histograms
hist(tviti$words)

hist(tviti$chars)

# Boxplot of a numeric variable split by the levels of a grouping variable
boxplot(tviti$standard_tech_n ~ tviti$gender)

# Mosaic plots of the crosstabs (all data, then the sample)
mosaicplot(L.by.gender)

mosaicplot(L.by.gender.1000)

-- Plot number of words vs. number of characters; do a histogram of linguistic standardness level; do a boxplot of linguistic standardness level by source; do a mosaic plot of sentiment by gender

### Some generic graph options ###

# Show the current graph settings
par()

# Change the axis label colour globally: the new default value stays in effect
# for the rest of the session or until it is changed again
par(col.lab = "red")

# Apply the same change to a single graph only, inside the drawing function
hist(tviti$words, col.lab = "red")

main="title" 	# chart title

xlab="X-axis label" 	# X axis label
ylab="Y-axis label" 	# Y axis label

xlim=c(xmin, xmax) 	# min and max value shown on X axis
ylim=c(ymin, ymax) 	# min and max value shown on Y axis

col=c("orchid","turquoise") 	# colour definition for two bars, two boxes, etc.; list as many colours as there are bars, boxes, etc., the colours will start repeating themselves if you do not specify enough values; detailed colour chart with colour names: http://research.stowers-institute.org/efg/R/Color/Chart/ColorChart.pdf

pch=0,1,2,3... 	# choose plotting symbol, e.g. 0=square, 1=circle (default); more values can be seen e.g. here http://www.statmethods.net/advgraphs/parameters.html

lty=0,1,2,3... 	# choose line type, 0=blank, 1=solid (default), 2=dashed, 3=dotted, 4=dotdash, 5=longdash, 6=twodash

# the above parameters can be defined within the plotting functions; examples can be seen below, where some additional parameters appear as well

### Plot ###

# Scatterplot as above, with some added features: point colour, chart title and a filled plotting symbol
plot(tviti$words, tviti$tokens, col = "turquoise", main = "Words by tokens", pch = 16)

# Plot values against their ranks, useful for seeing how the data are distributed
# (the column is written out in full as words rather than relying on $ completing the partial name word)
plot(sort(tviti$words))

-- Plot linguistic vs. technical standardness; plot sorted values for characters and linguistic standardness

### Boxplots ###

# Boxplot of the technical standardness score for each gender level
boxplot(tviti$standard_tech_n ~ tviti$gender)

# Adding colour to the boxes
boxplot(tviti$standard_tech_n ~ tviti$gender, col = c("orchid", "turquoise", "cadetblue4"))

# Specifying additional parameters:
#   xaxt = "n"    suppresses the default X axis, which is replaced by the axis defined through axis()
#   notch = TRUE  creates a notch in each box (TRUE is spelled out because T is an
#                 ordinary variable that can be overwritten, whereas TRUE cannot)
boxplot(
  tviti$standard_ling_n[tviti$gender == "female"],
  tviti$standard_ling_n[tviti$gender == "male"],
  main = "Linguistic standardness by gender",
  ylab = "Linguistic standardness score",
  col = c("orchid", "turquoise", "cadetblue4"),
  xlab = "Gender",
  xaxt = "n",
  notch = TRUE
)
grid()                                            # adds a grid to the plot
axis(1, at = 1:2, labels = c("female", "male"))   # custom X axis with one label per box

-- Draw a boxplot to compare the number of retweets for tweets with positive and negative sentiment; compare the number of emoticons in private vs. corporate tweets

### Mosaic plots ###

# Mosaic plot of the row proportions (all data), with custom colours
mosaicplot(L.by.gender.p, col = c("turquoise", "rosybrown4", "orchid"))

# Mosaic plot of the row proportions (sample), with a chart title and axis labels
mosaicplot(
  L.by.gender.1000.p,
  col = c("turquoise", "rosybrown4", "orchid"),
  main = "Linguistic standardness by gender",
  xlab = "Gender",
  ylab = "Linguistic standardness score"
)

-- Using the crosstabs you created before, draw a mosaic plot for sentiment by gender

### Histograms ###

# Histogram of the number of words, with a fill colour, an X axis label and a chart title
hist(
  tviti$words,
  col = "lightblue",
  xlab = "Number of words",
  main = "Number of words per tweet"
)

-- Draw histograms for linguistic standardness and technical standardness 

### Barplots ###

# Barplot of the crosstab; matrix is yet another object type in R;
# beside = TRUE groups the columns (instead of stacking them)
barplot(as.matrix(L.by.gender), beside = TRUE)

# Adding a chart title, an X axis label and bar colours
barplot(
  as.matrix(L.by.gender),
  beside = TRUE,
  main = "Standardness by gender",
  xlab = "Standardness level",
  col = c("paleturquoise1", "orangered")
)

# Defining the span of Y axis values with ylim
barplot(
  as.matrix(L.by.gender),
  beside = TRUE,
  main = "Standardness by gender",
  ylim = c(0, 2000000),
  xlab = "Standardness level",
  col = c("paleturquoise1", "orangered")
)
# Adding a legend to the last barplot
legend("topright", c("female", "male"), bty = "n", fill = c("paleturquoise1", "orangered"))

-- Create a barplot for standardness by gender

### Line graphs ###

# Line graph of linguistic standardness by gender, in percentages.
# Plotting the values for 'female' first: type = "o" means both points and lines
# (see other options at http://www.statmethods.net/graphs/line.html); proportions are
# turned into percentages (* 100) for easier viewing; [1, ] denotes the 1st row,
# [, 1] would be the 1st column, [1, 1] the top left data point
plot(L.by.gender.p[1, ] * 100, type = "o", ylim = c(0, 100), ann = FALSE, xaxt = "n")
# Adding the values for 'male'; lines() only adds to an existing graph, it cannot produce one on its own
lines(L.by.gender.p[2, ] * 100, type = "o", pch = 22, lty = 2)
# Labelling the X axis
axis(1, at = 1:3, labels = c("L1", "L2", "L3"))
# Adding the legend
legend("topright", c("female", "male"), lty = 1:2, lwd = 2, pch = c(1, 22))

# Line graph of sentiment by gender, in absolute frequencies.
# NOTE: sentiment.by.gender is not created anywhere in this handout; build it first
# (see the exercise at the end of Part 1), otherwise the commands below will fail
plot(sentiment.by.gender[1, ], type = "o", ylim = c(200000, 1500000), ann = FALSE, xaxt = "n")
lines(sentiment.by.gender[2, ], type = "o", pch = 22, lty = 2)
axis(1, at = 1:3, labels = c("negative", "neutral", "positive"))
legend("topright", c("female", "male"), lty = 1:2, lwd = 2, pch = c(1, 22))

# The same graph in percentages, based on row proportions
sentiment.by.gender.p = prop.table(sentiment.by.gender, margin = 1)
plot(sentiment.by.gender.p[1, ] * 100, type = "o", ylim = c(0, 100), ann = FALSE, xaxt = "n")
lines(sentiment.by.gender.p[2, ] * 100, type = "o", pch = 22, lty = 2)
axis(1, at = 1:3, labels = c("negative", "neutral", "positive"))
legend("topright", c("female", "male"), lty = 1:2, lwd = 2, pch = c(1, 22))

-- Create a line graph for standardness by gender using percentages (or you can try absolute frequencies)

******************* 
* PART 3 -  TESTS *
*******************

### Chi-square test(s) ###

# Two observed frequencies, named directly when the vector is created
chi1 = c(tutorial = 506, workshop = 2930)

# Goodness-of-fit test (results reported on slide 2): compares two frequencies without
# looking at any independent variables that might influence them; the probabilities
# expected under the null hypothesis (or some other criterion) must be provided
# (1/2 is also fine instead of 0.5)
chisq.test(chi1, p = c(0.5, 0.5))

# Tests can also be saved as objects
chi1.results = chisq.test(chi1, p = c(0.5, 0.5))

# Show the results
chi1.results

# Show more detailed info about the results
str(chi1.results)

# If a crosstab is supplied, chisq.test does a test of independence between
# two categorical variables; this is a very frequent test in corpus linguistics
chisq.test(L.by.gender)

# If the data is in crosstabs table format, summary will also provide the results of the chi-square test
summary(L.by.gender)

# Linguistic by technical standardness in the tweet sample: crosstab, test and summary
L.by.T.standardness.1000 = table(tviti$standard_ling, tviti$standard_tech)
chisq.test(L.by.T.standardness.1000)
summary(L.by.T.standardness.1000)

-- Do chi-square tests to see whether (a) sentiment and gender, and (b) linguistic standardness and technical standardness, are related or independent (use either tviti or tviti.freq.csv) 

### Correlation ###

# Obtain the correlation coefficient only; when parametric assumptions are fulfilled use method = "pearson"
cor(tviti$words, tviti$tokens, method = "spearman")

# Obtain additional information such as the p value
cor.test(tviti$words, tviti$tokens, method = "spearman")

# Compare to chisq.test(L.by.T.standardness.1000) above
cor.test(tviti$standard_tech_n, tviti$standard_ling_n, method = "spearman")

# Compare to kruskal.test(tviti$punct ~ tviti$standard_ling) below
cor.test(tviti$standard_ling_n, tviti$punct, method = "spearman")

-- Test the correlation between linguistic standardness and number of emoticons, between technical standardness and length in characters, and between linguistic standardness and number of commas

### Wilcoxon rank sum and t-test ###			

# Wilcoxon rank sum test: number of words by source
wilcox.test(tviti$words ~ tviti$source)

# Choosing two levels of a three-level variable using or (|); the same condition
# is applied to the values and to the grouping variable
wilcox.test(
  tviti$words[tviti$standard_ling == "L2" | tviti$standard_ling == "L3"] ~
    tviti$standard_ling[tviti$standard_ling == "L2" | tviti$standard_ling == "L3"]
)

# t-test for the same comparison; parametric assumptions need to be met
t.test(tviti$words ~ tviti$source)

-- Compare the mean/median number of words by gender; compare the mean/median number of emoticons by source

### Kruskal-Wallis test and one-way ANOVA ###		

# Kruskal-Wallis test (nonparametric)
kruskal.test(tviti$punct ~ tviti$standard_ling)

# One-way ANOVA (parametric)
oneway.test(tviti$punct ~ tviti$standard_ling)

-- Compare the mean/median number of emoticons by sentiment and by linguistic and technical standardness level 

### Normality check - Shapiro-Wilk test ### 		

# If the result is significant, the distribution is NOT normal and a nonparametric test is needed
shapiro.test(tviti$words)

# Applying the test to several data subsets (here to the three levels of linguistic standardness)
tapply(tviti$words, tviti$standard_ling, shapiro.test)

-- Check normality for number of characters, overall and by technical standardness  

### Equality of dispersion check - Ansari-Bradley test ###

# A non-parametric test of the dispersions in two datasets; if significant, the dispersions are NOT equal
ansari.test(tviti$words, tviti$tokens)

# The same test comparing the number of words in tweets by female and by male users
ansari.test(
  tviti$words[tviti$gender == "female"],
  tviti$words[tviti$gender == "male"]
)

-- Check whether the number of words has similar dispersions in corporate and private tweets


******************************** 
* RECAP FROM THE INTRO HANDOUT *
********************************

> 	# command prompt
+ 	# prompt for command continuation

help(function.name) or ?function.name 	# get help on a function

getwd() 		# see where your working directory is
ls() or objects() 	# list the current contents of the workspace
search() 	# see currently loaded packages (as well as attached objects, see below)

library() 	# see installed packages
install.packages("package.name") 	# install a package
library(package.name) 	# load a package
chooseCRANmirror() 	# choose an installation mirror
help(package="package.name") 	# get help on a package
data(data.object)	 # load an example set from a package

= <- -> 	# assignment operators for data objects

c() 	# concatenate a set of values separated by a comma; text has to be in quotation marks

# Create a data frame called somedata from vectors labelled col.a and col.b
# (the -> operator assigns the value on its left to the name on its right)
data.frame(
  col.a = c(1, 1, 2, 3),
  col.b = c(2, 4, 6, 8)
) -> somedata

# Refer to a specific column in a data frame
somedata$col.b

# Refer to specific values in a column in a data frame (here: col.b where col.a equals 1)
somedata$col.b[somedata$col.a == 1]

read.table("tablename.txt", sep = ",", header=TRUE) 	# import a table with a header; separator specified as comma, other options available
read.csv("tablename.csv", header=TRUE) 	# import a table with comma-separated fields with a header
read.delim("tablename.txt", header=TRUE) 	# import a table with tab-delimited fields with a header

head(somedata) 	# show the first rows of the data object 
summary(somedata) 	# list basic info about the data object

options(OutDec= ",") 	# see output with commas as decimal marks

quit() or q() 	# exit R


********************** 
* OTHER USEFUL STUFF *
**********************

### Main mathematical operators ###
+ - * /
== > < >= <=

### Main logical operators ###

& 	# and
| 	# or
! 	# negation (e.g. != means 'is not'; note the single rather than double =)

### Varia ###

# Define the number of rows shown (default is 6); the object name is spelled somedata,
# matching the data frame created in the recap section
head(somedata, 2)
# Show the 6 bottom rows of the data object (or set a number as above)
tail(somedata)

# Sort data in a vector; sort() is given a single column, because it works on vectors,
# not on a whole data frame
sort(somedata$col.b)

# List column names
colnames(somedata)

# Find the class an object belongs to (vector, data.frame...)
class(somedata)

# Import a table with fields separated by semi-colons (;), and comma as the decimal mark
read.csv2("tablename.csv", header=TRUE)
# Import a table with tab-delimited fields and comma as the decimal mark
read.delim2("tablename.csv", header=TRUE)