# Handout script: a walk-through of basic R (crosstabs, descriptives, graphs
# and statistical tests) using the tweet sample in tviti.1000.csv.
# Lines starting with "# --" are exercises for the reader.
# Headings, exercises and placeholder listings are written as comments so the
# file can be parsed and run from top to bottom.

# ********************
# * PART 1 - GENERIC *
# ********************

### Loading the data frame we will work on ###

tviti = read.csv("tviti.1000.csv", header = TRUE)

# -- Use head(tviti) to check the data

### Creating crosstabs to work on ###

# step 1 of crosstabulating the total number of tweets by gender and language
# standardness (data from file tviti.freq.csv)
L.by.gender = matrix(c(782005, 268733, 100562, 1748410, 506605, 144350),
                     ncol = 3, byrow = TRUE)
colnames(L.by.gender) = c("L1", "L2", "L3") # column names are added separately
rownames(L.by.gender) = c("female", "male") # row names are added separately
L.by.gender = as.table(L.by.gender) # format adjustment
# turning the values into relative frequencies (proportions), based on table rows
L.by.gender.p = prop.table(L.by.gender, 1)

# obtaining and crosstabulating frequencies in the tweet sample (tviti),
# excluding the "neutral" values
L.by.gender.1000 = xtabs(~ tviti$gender + tviti$standard_ling,
                         exclude = "neutral")
# "data=" is used instead of tviti$ before column names
L.by.gender.1000.2 = xtabs(~ gender + standard_ling, exclude = "neutral",
                           data = tviti)
L.by.gender.1000.p = prop.table(L.by.gender.1000, 1)

# -- Create similar tables for sentiment by gender, and for linguistic by
#    technical standardness, for all data (see tviti.freq.csv) and/or for
#    sample data (tviti)

# **************************
# * PART 2a - DESCRIPTIVES *
# **************************

### Frequencies other than crosstabs ###

# show total number of observations by X (in this case by gender)
table(tviti$gender)
# show total number of observations for X= (value shown under TRUE, also
# returns the rest under FALSE)
table(tviti$gender == "female")
# show total number of observations for X= and Y=
table(tviti$gender == "female" & tviti$sentiment == "positive")
# show total number of observations for X= and Y different from "Z"
table(tviti$gender == "female" & tviti$sentiment != "positive")

# -- Find the numbers of tweets scored L1, L2 and L3 on linguistic
#    standardness; find the numbers of tweets by female users scored T1, T2
#    and T3 on technical standardness

### Descriptive measures ###

# NOTE(review): the column is spelled out as "words" (as in the rest of this
# file) instead of relying on partial matching of tviti$word -- confirm the
# column name against the CSV header
mean(tviti$words)
sd(tviti$words)
var(tviti$words)
median(tviti$words)
quantile(tviti$words)
min(tviti$words)
max(tviti$words)
summary(tviti$words)

# -- Calculate the mean and standard deviation for the number of tokens,
#    number of characters and number of punctuation symbols; calculate the
#    median for the same data

mean(tviti$words[tviti$gender == "female"]) # referring to a portion of the data

# -- Calculate the average number of emoticons in tweets by male and female
#    users, and in tweets by corporate and private users

### Repeating operations over multiple sets of data ###

# special functions allow you to use a single command for doing the same
# operation over several (sub)sets of data; the most useful function for data
# frames is tapply

# calculates the mean (for number of words) by variable level (gender)
tapply(tviti$words, tviti$gender, mean)
# the same with subsets of data
tapply(tviti$words[tviti$sentiment == "positive"],
       tviti$gender[tviti$sentiment == "positive"], mean)

# -- Use tapply to calculate the mean linguistic standardness score
#    (standard_ling_n) by sentiment alone, and by sentiment but only for
#    corporate users

# ********************
# * PART 2b - GRAPHS *
# ********************

### Some basic graphs ###

# produce a scatterplot; plot() will give different graphs depending on the
# data format
plot(tviti$words, tviti$tokens)
plot(tviti$standard_tech_n, tviti$standard_ling_n)
hist(tviti$words) # produce a histogram
hist(tviti$chars)
boxplot(tviti$standard_tech_n ~ tviti$gender) # produce a boxplot
mosaicplot(L.by.gender) # produce a mosaic plot
mosaicplot(L.by.gender.1000)

# -- Plot number of words vs. number of characters; do a histogram of
#    linguistic standardness level; do a boxplot of linguistic standardness
#    level by source; do a mosaic plot of sentiment by gender

### Some generic graph options ###

par() # see current graph settings
# changing axis label colour; global change of a default parameter value, will
# be in effect for the rest of the session or until changed again; par()
# returns the previous values, which are kept here so the change can be undone
old.par = par(col.lab = "red")
par(old.par) # restore the previous setting
# applying the same change to a single graph within the drawing function
hist(tviti$words, col.lab = "red")

# The options below are arguments to be used inside plotting functions; they
# are listed for reference and are not meant to be run on their own.
#   main="title"                  # chart title
#   xlab="X-axis label"           # X axis label
#   ylab="Y-axis label"           # Y axis label
#   xlim=c(xmin, xmax)            # min and max value shown on X axis
#   ylim=c(ymin, ymax)            # min and max value shown on Y axis
#   col=c("orchid","turquoise")   # colour definition for two bars, two boxes,
#                                 # etc.; list as many colours as there are
#                                 # bars, boxes, etc., the colours will start
#                                 # repeating themselves if you do not specify
#                                 # enough values; detailed colour chart with
#                                 # colour names:
#                                 # http://research.stowers-institute.org/efg/R/Color/Chart/ColorChart.pdf
#   pch=0,1,2,3...                # choose plotting symbol, e.g. 0=square,
#                                 # 1=circle (default); more values can be seen
#                                 # e.g. here
#                                 # http://www.statmethods.net/advgraphs/parameters.html
#   lty=0,1,2,3...                # choose line type, 0=blank, 1=solid
#                                 # (default), 2=dashed, 3=dotted, 4=dotdash,
#                                 # 5=longdash, 6=twodash
# the above parameters can be defined within the plotting functions; examples
# can be seen below, where some additional parameters appear as well

### Plot ###

# as above, with some added features
plot(tviti$words, tviti$tokens, col = "turquoise", main = "Words by tokens",
     pch = 16)
# plot values against their ranks, useful for seeing how the data are
# distributed
plot(sort(tviti$words))

# -- Plot linguistic vs. technical standardness; plot sorted values for
#    characters and linguistic standardness

### Boxplots ###

boxplot(tviti$standard_tech_n ~ tviti$gender)
# adding colour to the boxes
boxplot(tviti$standard_tech_n ~ tviti$gender,
        col = c("orchid", "turquoise", "cadetblue4"))
# specifying additional parameters; xaxt="n" suppresses the default X axis,
# which is replaced by an axis defined through axis(); notch=TRUE creates a
# notch, and grid() adds a grid
boxplot(tviti$standard_ling_n[tviti$gender == "female"],
        tviti$standard_ling_n[tviti$gender == "male"],
        main = "Linguistic standardness by gender",
        ylab = "Linguistic standardness score",
        col = c("orchid", "turquoise", "cadetblue4"),
        xlab = "Gender", xaxt = "n", notch = TRUE)
grid()
axis(1, at = 1:2, labels = c("female", "male"))

# -- Draw a boxplot to compare the number of retweets for tweets with positive
#    and negative sentiment; compare the number of emoticons in private vs.
#    corporate tweets

### Mosaic plots ###

mosaicplot(L.by.gender.p, col = c("turquoise", "rosybrown4", "orchid"))
mosaicplot(L.by.gender.1000.p, col = c("turquoise", "rosybrown4", "orchid"),
           main = "Linguistic standardness by gender", xlab = "Gender",
           ylab = "Linguistic standardness score")

# -- Using the crosstabs you created before, draw a mosaic plot for sentiment
#    by gender

### Histograms ###

hist(tviti$words, col = "lightblue", xlab = "Number of words",
     main = "Number of words per tweet")

# -- Draw histograms for linguistic standardness and technical standardness

### Barplots ###

# matrix is yet another object type in R; beside=TRUE groups columns (instead
# of stacking them)
barplot(as.matrix(L.by.gender), beside = TRUE)
barplot(as.matrix(L.by.gender), beside = TRUE, main = "Standardness by gender",
        xlab = "Standardness level", col = c("paleturquoise1", "orangered"))
# defining the span of Y axis values
barplot(as.matrix(L.by.gender), beside = TRUE, main = "Standardness by gender",
        ylim = c(0, 2000000), xlab = "Standardness level",
        col = c("paleturquoise1", "orangered"))
# adding a legend
legend("topright", c("female", "male"), bty = "n",
       fill = c("paleturquoise1", "orangered"))

# -- Create a barplot for standardness by gender

### Line graphs ###

# plotting the values for 'female'; type="o" means both points and lines; see
# other options at http://www.statmethods.net/graphs/line.html; proportions are
# turned into percentages (*100) for easier viewing; [1,] denotes 1st row,
# [,1] would be 1st column, [1,1] the top left data point
plot(L.by.gender.p[1, ] * 100, type = "o", ylim = c(0, 100), ann = FALSE,
     xaxt = "n")
# adding the values for 'male'; lines() defines options, it cannot produce a
# graph on its own
lines(L.by.gender.p[2, ] * 100, type = "o", pch = 22, lty = 2)
axis(1, at = 1:3, labels = c("L1", "L2", "L3")) # labelling X axis
legend("topright", c("female", "male"), lty = 1:2, lwd = 2,
       pch = c(1, 22)) # adding the legend

# sentiment.by.gender is not created anywhere in this file -- presumably it is
# the sentiment-by-gender crosstab from the Part 1 exercise; the graphs below
# are skipped (with a message) if it does not exist yet
if (exists("sentiment.by.gender")) {
  plot(sentiment.by.gender[1, ], type = "o", ylim = c(200000, 1500000),
       ann = FALSE, xaxt = "n")
  lines(sentiment.by.gender[2, ], type = "o", pch = 22, lty = 2)
  axis(1, at = 1:3, labels = c("negative", "neutral", "positive"))
  legend("topright", c("female", "male"), lty = 1:2, lwd = 2, pch = c(1, 22))

  sentiment.by.gender.p = prop.table(sentiment.by.gender, 1)
  plot(sentiment.by.gender.p[1, ] * 100, type = "o", ylim = c(0, 100),
       ann = FALSE, xaxt = "n")
  lines(sentiment.by.gender.p[2, ] * 100, type = "o", pch = 22, lty = 2)
  axis(1, at = 1:3, labels = c("negative", "neutral", "positive"))
  legend("topright", c("female", "male"), lty = 1:2, lwd = 2, pch = c(1, 22))
} else {
  message("sentiment.by.gender not found: create it first to draw these graphs")
}

# -- Create a line graph for standardness by gender using percentages (or you
#    can try absolute frequencies)

# *******************
# * PART 3 - TESTS  *
# *******************

### Chi-square test(s) ###

chi1 = c(506, 2930)
names(chi1) = c("tutorial", "workshop")
# test whose results are reported on slide 2; goodness-of-fit test, compares
# two frequencies without looking at any independent variables that might
# influence them; probabilities expected under the null hypothesis (or some
# other criterion) must be provided (1/2 is also fine instead of 0.5)
chisq.test(chi1, p = c(0.5, 0.5))
chi1.results = chisq.test(chi1, p = c(0.5, 0.5)) # tests can also be saved as objects
chi1.results # shows the results
str(chi1.results) # shows more detailed info about the results

# if crosstabs are invoked, chisq.test does a test of independence between two
# categorical variables; this is a very frequent test in corpus linguistics
chisq.test(L.by.gender)
# if the data is in crosstabs table format, summary will also provide the
# results of the chi-square test
summary(L.by.gender)

L.by.T.standardness.1000 = table(tviti$standard_ling, tviti$standard_tech)
chisq.test(L.by.T.standardness.1000)
summary(L.by.T.standardness.1000)

# -- Do chi-square tests to see if sentiment and gender and linguistic
#    standardness and technical standardness are related or independent (use
#    either tviti or tviti.freq.csv)

### Correlation ###

# obtain correlation coefficient only; when parametric assumptions are
# fulfilled use method="pearson"
cor(tviti$words, tviti$tokens, method = "spearman")
# obtain additional information such as p value
cor.test(tviti$words, tviti$tokens, method = "spearman")
# compare to chisq.test(L.by.T.standardness.1000) above
cor.test(tviti$standard_tech_n, tviti$standard_ling_n, method = "spearman")
# compare to kruskal.test(tviti$punct ~ tviti$standard_ling) below
cor.test(tviti$standard_ling_n, tviti$punct, method = "spearman")

# -- Test the correlation between linguistic standardness and number of
#    emoticons, between technical standardness and length in characters, and
#    between linguistic standardness and number of commas

### Wilcoxon rank sum and t-test ###

wilcox.test(tviti$words ~ tviti$source)
# choosing two levels of a three-level variable using or (|); the selection is
# stored once so that both sides of the formula use exactly the same rows
L2.or.L3 = tviti$standard_ling == "L2" | tviti$standard_ling == "L3"
wilcox.test(tviti$words[L2.or.L3] ~ tviti$standard_ling[L2.or.L3])
t.test(tviti$words ~ tviti$source) # parametric assumptions need to be met

# -- Compare the mean/median number of words by gender; compare the
#    mean/median number of emoticons by source

### Kruskal-Wallis test and one-way ANOVA ###

kruskal.test(tviti$punct ~ tviti$standard_ling) # nonparametric
oneway.test(tviti$punct ~ tviti$standard_ling) # parametric

# -- Compare the mean/median number of emoticons by sentiment and by
#    linguistic and technical standardness level

### Normality check - Shapiro-Wilk test ###

# if the result is significant, the distribution is NOT normal, nonparametric
# test needed
shapiro.test(tviti$words)
# applying the test to several data subsets (here to the three levels of
# linguistic standardness)
tapply(tviti$words, tviti$standard_ling, shapiro.test)

# -- Check normality for number of characters, overall and by technical
#    standardness

### Equality of dispersion check - Ansari-Bradley test ###

# a non-parametric test of dispersions in two datasets; if significant, the
# dispersions are NOT equal
ansari.test(tviti$words, tviti$tokens)
ansari.test(tviti$words[tviti$gender == "female"],
            tviti$words[tviti$gender == "male"])

# -- Check whether the number of words has similar dispersions in corporate
#    and private tweets

# ********************************
# * RECAP FROM THE INTRO HANDOUT *
# ********************************

# Quick reference. Entries that contain placeholders (function.name,
# package.name, data.object, tablename.*) or that change the session are kept
# as comments; replace the placeholders before running them.
#   >                                       # command prompt
#   +                                       # prompt for command continuation
#   help(function.name) or ?function.name   # get help on a function
#   getwd()                                 # see where your working directory is
#   ls() or objects()                       # list the current contents of the workspace
#   search()                                # see currently loaded packages (as well as attached objects, see below)
#   library()                               # see installed packages
#   install.packages("package.name")        # install a package
#   library(package.name)                   # load a package
#   chooseCRANmirror()                      # choose an installation mirror
#   help(package="package.name")            # get help on a package
#   data(data.object)                       # load an example set from a package
#   = <- ->                                 # assignment operators for data objects
#   c()                                     # concatenate a set of values separated by a comma; text has to be in quotation marks

# create a data frame called somedata from vectors labelled col.a and col.b
somedata = data.frame(col.a = c(1, 1, 2, 3), col.b = c(2, 4, 6, 8))
somedata$col.b # refer to a specific column in a data frame
somedata$col.b[somedata$col.a == 1] # refer to specific values in a column in a data frame

#   read.table("tablename.txt", sep = ",", header=TRUE)  # import a table with a header; separator specified as comma, other options available
#   read.csv("tablename.csv", header=TRUE)               # import a table with comma-separated fields with a header
#   read.delim("tablename.txt", header=TRUE)             # import a table with tab-delimited fields with a header

head(somedata) # show the first rows of the data object
summary(somedata) # list basic info about the data object

#   options(OutDec= ",")                    # see output with commas as decimal marks
#   quit() or q()                           # exit R

# **********************
# * OTHER USEFUL STUFF *
# **********************

### Main mathematical operators ###
#   + - * / == > < >= <=

### Main logical operators ###
#   &   # and
#   |   # or
#   !   # negation (e.g. != means 'is not'; note the single rather than double =)

### Varia ###

head(somedata, 2) # define the number of rows shown, default is 6
tail(somedata) # show the 6 bottom rows of the data object (or set a number as above)
sort(somedata$col.b) # sort data in a vector (here one column of the data frame)
colnames(somedata) # list column names
class(somedata) # find the class object X belongs to (vector, data.frame...)

#   read.csv2("tablename.csv", header=TRUE)     # import a table with fields separated by semi-colons (;), and comma as the decimal mark
#   read.delim2("tablename.csv", header=TRUE)   # import a table with tab-delimited fields and comma as the decimal mark