About

Description

This section is by far the busiest of the four that precede the article. For each group of search terms it builds a contingency table, a bar chart, and a map, and conducts a chi-squared test to determine how the terms are distributed across census regions. These outputs serve as the foundation for all prose analysis of the data.

Usage

After loading the necessary packages, the code should be run once for each updated CSV: it creates a contingency table, a bar chart, and a map, then conducts a chi-squared test and follows up on the test results. The chunks for these steps are labeled below; the only thing that needs to be swapped between groups is the name of the updated CSV.
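
Because only the dataset changes between groups, the repeated steps could also be wrapped in helper functions. The sketch below (a hypothetical `plot_term_counts()`, not part of this script) illustrates the idea for the bar-chart step; it assumes the packages loaded in Setup.

# Hypothetical helper: count bar chart for any of the updated datasets
plot_term_counts <- function(data, title) {
  data %>% # dataset
    ggplot(aes(x = census_region, fill = search_term)) + # mappings
    geom_bar() + # geometry
    labs(y = "Count", x = "Census Region", title = title) # labels
}

# Example: plot_term_counts(updated_plurals, "Second Person Plurals")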

Setup

# Script-specific options or packages
library(tidyverse)     # data manipulation and plotting
library(knitr)         # table output

library(janitor)       # contingency tables (`tabyl()` and `adorn_*()`)
library(patchwork)     # combining plots side by side
library(broom)         # tidying statistical test output (`augment()`)
library(effectsize)    # effect sizes and their interpretation

Run

# Original Twitter datasets, which retain tweet coordinates for the maps
plurals <- read_csv(file = "../data/original/twitter/plurals.csv")
## Warning: One or more parsing issues, see `problems()` for details
sales <- read_csv(file = "../data/original/twitter/sales.csv")
## Warning: One or more parsing issues, see `problems()` for details
shoes <- read_csv(file = "../data/original/twitter/shoes.csv")
## Warning: One or more parsing issues, see `problems()` for details
tonix <- read_csv(file = "../data/original/twitter/tonix.csv")
## Warning: One or more parsing issues, see `problems()` for details
roads <- read_csv(file = "../data/original/twitter/roads.csv")
## Warning: One or more parsing issues, see `problems()` for details

# Updated datasets, with census regions added, for the tables and tests
updated_plurals <- read_csv(file = "../data/derived/updated_plurals.csv")
updated_sales <- read_csv(file = "../data/derived/updated_sales.csv")
updated_shoes <- read_csv(file = "../data/derived/updated_shoes.csv")
updated_tonix <- read_csv(file = "../data/derived/updated_tonix.csv")
## Warning: One or more parsing issues, see `problems()` for details
updated_roads <- read_csv(file = "../data/derived/updated_roads.csv")
## Warning: One or more parsing issues, see `problems()` for details

Second Person Plurals

updated_plurals %>% 
  tabyl(search_term, census_region) %>% # cross-tabulate
  adorn_totals(c("row", "col")) %>% # provide row and column totals
  adorn_percentages("col") %>% # add percentages to the columns
  adorn_pct_formatting(rounding = "half up", digits = 0) %>% # round the digits
  adorn_ns() %>% # add observation number
  adorn_title("combined") %>% # add a header title
  kable(booktabs = TRUE, # pretty table
        caption = "Contingency table for `search_term` and `census_region`.") # caption

Table 1: Contingency table for search_term and census_region.

search_term/census_region   Midwest    Northeast   South      West       Total
“you guys”                  43% (9)    44% (7)     20% (13)   36% (10)   30% (39)
y’all                       24% (5)    25% (4)     33% (21)   36% (10)   31% (40)
yall                        33% (7)    31% (5)     47% (30)   29% (8)    39% (50)
Total                       100% (21)  100% (16)   100% (64)  100% (28)  100% (129)

p1 <- 
  updated_plurals %>% # dataset
  ggplot(aes(x = census_region, fill = search_term)) + # mappings
  geom_bar() + # geometry
  labs(y = "Count", x = "Census Region") # labels

p2 <- 
  updated_plurals %>% # dataset
  ggplot(aes(x = census_region, fill = search_term)) + # mappings
  geom_bar(position = "fill") + # geometry, with fill for proportion plot
  labs(y = "Proportion", x = "Census Region", fill = "Modality") # labels

p1 <- p1 + theme(legend.position = "none") # remove legend from left plot

p1 + p2 + plot_annotation("Distribution of Second Person Plurals Across the United States")

ror_mod_table <- 
  xtabs(formula = ~ census_region + search_term, # formula 
        data = updated_plurals) # dataset

c2 <- chisq.test(ror_mod_table) # apply the chi-squared test to `ror_mod_table`

c2 # preview the test results
## 
##  Pearson's Chi-squared test
## 
## data:  ror_mod_table
## X-squared = 7.4697, df = 6, p-value = 0.2796

c2$p.value < .05 # check whether the p-value falls below .05
## [1] FALSE
c2 %>% # statistical result
  augment() # per-cell observed and expected counts, with residuals
effects <- effectsize(c2)  # evaluate effect size and generate a confidence interval

effects  # preview effect size and confidence interval

interpret_r(effects$Cramers_v)  # interpret the effect size
## [1] "small"
## (Rules: funder2019)
#> [1] "small"
#> (Rules: funder2019)
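
As a cross-check on the effect size, Cramér's V can be computed by hand as the square root of X-squared / (n * (k - 1)), where n is the total count and k the smaller dimension of the table. The lines below are illustrative rather than part of the original chunk; `effectsize()` may apply a small-sample adjustment, so the two values can differ slightly.

n <- sum(ror_mod_table) # total observations
k <- min(dim(ror_mod_table)) # smaller table dimension
sqrt(unname(c2$statistic) / (n * (k - 1))) # unadjusted Cramér's V
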
states_map <- map_data("state") # US state polygons, via ggplot2 (requires the maps package)

p <- 
  ggplot() + # base plot
  geom_polygon(data = states_map, # state boundaries
               aes(x = long, y = lat, group = group), # mappings
               fill = "grey", color = "black") + # polygon styling
  labs(title = "Tweets in the USA", subtitle = "Second Person Plurals") # labels

p + geom_point(data = plurals, # original dataset, which retains tweet coordinates
               aes(x = lng, y = lat, color = search_term), # mappings
               alpha = 1/2, size = 1.5) # point transparency and size

Outdoor Sales

updated_sales %>% 
  tabyl(search_term, census_region) %>% # cross-tabulate
  adorn_totals(c("row", "col")) %>% # provide row and column totals
  adorn_percentages("col") %>% # add percentages to the columns
  adorn_pct_formatting(rounding = "half up", digits = 0) %>% # round the digits
  adorn_ns() %>% # add observation number
  adorn_title("combined") %>% # add a header title
  kable(booktabs = TRUE, # pretty table
        caption = "Contingency table for `search_term` and `census_region`.") # caption

Table 2: Contingency table for search_term and census_region.

search_term/census_region   Midwest   Northeast  South      West       Total
“garage sale”               50% (3)   67% (2)    45% (5)    43% (6)    47% (16)
“yard sale”                 50% (3)   33% (1)    55% (6)    57% (8)    53% (18)
Total                       100% (6)  100% (3)   100% (11)  100% (14)  100% (34)

p1 <- 
  updated_sales %>% # dataset
  ggplot(aes(x = census_region, fill = search_term)) + # mappings
  geom_bar() + # geometry
  labs(y = "Count", x = "Census Region") # labels

p2 <- 
  updated_sales %>% # dataset
  ggplot(aes(x = census_region, fill = search_term)) + # mappings
  geom_bar(position = "fill") + # geometry, with fill for proportion plot
  labs(y = "Proportion", x = "Census Region", fill = "Modality") # labels

p1 <- p1 + theme(legend.position = "none") # remove legend from left plot

p1 + p2 + plot_annotation("Distribution of Terms for Outdoor Sales Across the US")

ror_mod_table <- 
  xtabs(formula = ~ census_region + search_term, # formula 
        data = updated_sales) # dataset

c2 <- chisq.test(ror_mod_table) # apply the chi-squared test to `ror_mod_table`

c2 # preview the test results
## 
##  Pearson's Chi-squared test
## 
## data:  ror_mod_table
## X-squared = 0.59437, df = 3, p-value = 0.8977

c2$p.value < .05 # check whether the p-value falls below .05
## [1] FALSE
c2 %>% # statistical result
  augment() # per-cell observed and expected counts, with residuals
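
With only 34 tweets in this group, the expected cell counts are worth checking before trusting the chi-squared approximation (a quick check, not part of the original chunk):

min(c2$expected) # smallest expected cell count; the approximation is
                 # usually considered unreliable when this falls below 5
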
effects <- effectsize(c2)  # evaluate effect size and generate a confidence interval

effects  # preview effect size and confidence interval

interpret_r(effects$Cramers_v)  # interpret the effect size
## [1] "small"
## (Rules: funder2019)
#> [1] "small"
#> (Rules: funder2019)
states_map <- map_data("state") # US state polygons, via ggplot2 (requires the maps package)

p <- 
  ggplot() + # base plot
  geom_polygon(data = states_map, # state boundaries
               aes(x = long, y = lat, group = group), # mappings
               fill = "grey", color = "black") + # polygon styling
  labs(title = "Tweets in the USA", subtitle = "Outdoor Sales") # labels

p + geom_point(data = sales, # original dataset, which retains tweet coordinates
               aes(x = lng, y = lat, color = search_term), # mappings
               alpha = 1/2, size = 1.5) # point transparency and size

Gym Shoes

updated_shoes %>% 
  tabyl(search_term, census_region) %>% # cross-tabulate
  adorn_totals(c("row", "col")) %>% # provide row and column totals
  adorn_percentages("col") %>% # add percentages to the columns
  adorn_pct_formatting(rounding = "half up", digits = 0) %>% # round the digits
  adorn_ns() %>% # add observation number
  adorn_title("combined") %>% # add a header title
  kable(booktabs = TRUE, # pretty table
        caption = "Contingency table for `search_term` and `census_region`.") # caption

Table 3: Contingency table for search_term and census_region.

search_term/census_region   Midwest   Northeast  South      West       Total
“tennis shoes”              75% (6)   0% (0)     38% (11)   37% (7)    34% (24)
sneakers                    25% (2)   100% (15)  62% (18)   63% (12)   66% (47)
Total                       100% (8)  100% (15)  100% (29)  100% (19)  100% (71)

p1 <- 
  updated_shoes %>% # dataset
  ggplot(aes(x = census_region, fill = search_term)) + # mappings
  geom_bar() + # geometry
  labs(y = "Count", x = "Census Region") # labels

p2 <- 
  updated_shoes %>% # dataset
  ggplot(aes(x = census_region, fill = search_term)) + # mappings
  geom_bar(position = "fill") + # geometry, with fill for proportion plot
  labs(y = "Proportion", x = "Census Region", fill = "Modality") # labels

p1 <- p1 + theme(legend.position = "none") # remove legend from left plot

p1 + p2 + plot_annotation("Distribution of Terms for Gym Shoes Across the United States")

ror_mod_table <- 
  xtabs(formula = ~ census_region + search_term, # formula 
        data = updated_shoes) # dataset

c2 <- chisq.test(ror_mod_table) # apply the chi-squared test to `ror_mod_table`
## Warning in stats::chisq.test(x, y, ...): Chi-squared approximation may be incorrect
c2 # preview the test results
## 
##  Pearson's Chi-squared test
## 
## data:  ror_mod_table
## X-squared = 14.027, df = 3, p-value = 0.002869
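
The warning above signals small expected counts (the Midwest column holds only 8 tweets). Two common checks, sketched here rather than taken from the original analysis, are a simulated p-value and Fisher's exact test:

chisq.test(ror_mod_table, simulate.p.value = TRUE, B = 10000) # Monte Carlo p-value
fisher.test(ror_mod_table) # exact test, feasible for a table this small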

c2$p.value < .05 # check whether the p-value falls below .05
## [1] TRUE
c2 %>% # statistical result
  augment() # per-cell observed and expected counts, with residuals
effects <- effectsize(c2)  # evaluate effect size and generate a confidence interval

effects  # preview effect size and confidence interval

interpret_r(effects$Cramers_v)  # interpret the effect size
## [1] "very large"
## (Rules: funder2019)
#> [1] "small"
#> (Rules: funder2019)
states_map <- map_data("state") # US state polygons, via ggplot2 (requires the maps package)

p <- 
  ggplot() + # base plot
  geom_polygon(data = states_map, # state boundaries
               aes(x = long, y = lat, group = group), # mappings
               fill = "grey", color = "black") + # polygon styling
  labs(title = "Tweets in the USA", subtitle = "Shoes") # labels

p + geom_point(data = shoes, # original dataset, which retains tweet coordinates
               aes(x = lng, y = lat, color = search_term), # mappings
               alpha = 1/2, size = 1.5) # point transparency and size

Soft Drinks

updated_tonix %>% 
  tabyl(search_term, census_region) %>% # cross-tabulate
  adorn_totals(c("row", "col")) %>% # provide row and column totals
  adorn_percentages("col") %>% # add percentages to the columns
  adorn_pct_formatting(rounding = "half up", digits = 0) %>% # round the digits
  adorn_ns() %>% # add observation number
  adorn_title("combined") %>% # add a header title
  kable(booktabs = TRUE, # pretty table
        caption = "Contingency table for `search_term` and `census_region`.") # caption

Table 4: Contingency table for search_term and census_region.

search_term/census_region   Midwest    Northeast  South      West       Total
coke                        49% (22)   43% (16)   37% (36)   27% (17)   37% (91)
pop                         24% (11)   19% (7)    30% (29)   27% (17)   26% (64)
soda                        27% (12)   38% (14)   34% (33)   46% (29)   36% (88)
Total                       100% (45)  100% (37)  100% (98)  100% (63)  100% (243)

p1 <- 
  updated_tonix %>% # dataset
  ggplot(aes(x = census_region, fill = search_term)) + # mappings
  geom_bar() + # geometry
  labs(y = "Count", x = "Census Region") # labels

p2 <- 
  updated_tonix %>% # dataset
  ggplot(aes(x = census_region, fill = search_term)) + # mappings
  geom_bar(position = "fill") + # geometry, with fill for proportion plot
  labs(y = "Proportion", x = "Census Region", fill = "Modality") # labels

p1 <- p1 + theme(legend.position = "none") # remove legend from left plot

p1 + p2 + plot_annotation("Distribution of Terms for Soft Drinks Across the US")

ror_mod_table <- 
  xtabs(formula = ~ census_region + search_term, # formula 
        data = updated_tonix) # dataset

c2 <- chisq.test(ror_mod_table) # apply the chi-squared test to `ror_mod_table`

c2 # preview the test results
## 
##  Pearson's Chi-squared test
## 
## data:  ror_mod_table
## X-squared = 8.0096, df = 6, p-value = 0.2374

c2$p.value < .05 # check whether the p-value falls below .05
## [1] FALSE
c2 %>% # statistical result
  augment() # per-cell observed and expected counts, with residuals
effects <- effectsize(c2)  # evaluate effect size and generate a confidence interval

effects  # preview effect size and confidence interval

interpret_r(effects$Cramers_v)  # interpret the effect size
## [1] "small"
## (Rules: funder2019)
#> [1] "small"
#> (Rules: funder2019)
states_map <- map_data("state") # US state polygons, via ggplot2 (requires the maps package)

p <- 
  ggplot() + # base plot
  geom_polygon(data = states_map, # state boundaries
               aes(x = long, y = lat, group = group), # mappings
               fill = "grey", color = "black") + # polygon styling
  labs(title = "Tweets in the USA", subtitle = "Soft Drinks") # labels

p + geom_point(data = tonix, # original dataset, which retains tweet coordinates
               aes(x = lng, y = lat, color = search_term), # mappings
               alpha = 1/2, size = 1.5) # tweets lacking coordinates are dropped with a warning
## Warning: Removed 5724 rows containing missing values (geom_point).

Major Roads

updated_roads %>% 
  tabyl(search_term, census_region) %>% # cross-tabulate
  adorn_totals(c("row", "col")) %>% # provide row and column totals
  adorn_percentages("col") %>% # add percentages to the columns
  adorn_pct_formatting(rounding = "half up", digits = 0) %>% # round the digits
  adorn_ns() %>% # add observation number
  adorn_title("combined") %>% # add a header title
  kable(booktabs = TRUE, # pretty table
        caption = "Contingency table for `search_term` and `census_region`.") # caption

Table 5: Contingency table for search_term and census_region.

search_term/census_region   Midwest    Northeast  South      West       Total
freeway                     26% (6)    18% (4)    61% (39)   61% (60)   52% (109)
highway                     74% (17)   82% (18)   39% (25)   39% (39)   48% (99)
Total                       100% (23)  100% (22)  100% (64)  100% (99)  100% (208)

p1 <- 
  updated_roads %>% # dataset
  ggplot(aes(x = census_region, fill = search_term)) + # mappings
  geom_bar() + # geometry
  labs(y = "Count", x = "Census Region") # labels

p2 <- 
  updated_roads %>% # dataset
  ggplot(aes(x = census_region, fill = search_term)) + # mappings
  geom_bar(position = "fill") + # geometry, with fill for proportion plot
  labs(y = "Proportion", x = "Census Region", fill = "Modality") # labels

p1 <- p1 + theme(legend.position = "none") # remove legend from left plot

p1 + p2 + plot_annotation("Distribution of Terms for Major Roads Across the US")

ror_mod_table <- 
  xtabs(formula = ~ census_region + search_term, # formula 
        data = updated_roads) # dataset

c2 <- chisq.test(ror_mod_table) # apply the chi-squared test to `ror_mod_table`

c2 # preview the test results
## 
##  Pearson's Chi-squared test
## 
## data:  ror_mod_table
## X-squared = 21.255, df = 3, p-value = 9.317e-05

c2$p.value < .05 # check whether the p-value falls below .05
## [1] TRUE
c2 %>% # statistical result
  augment() # per-cell observed and expected counts, with residuals
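
Because this test is significant, the standardized residuals show which cells drive the association; values beyond roughly ±2 depart notably from independence. This is a quick follow-up using the `c2` object above, not a step in the original chunk.

round(c2$stdres, 2) # standardized Pearson residuals for each cell
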
effects <- effectsize(c2)  # evaluate effect size and generate a confidence interval

effects  # preview effect size and confidence interval

interpret_r(effects$Cramers_v)  # interpret the effect size
## [1] "large"
## (Rules: funder2019)
#> [1] "small"
#> (Rules: funder2019)
states_map <- map_data("state") # US state polygons, via ggplot2 (requires the maps package)

p <- 
  ggplot() + # base plot
  geom_polygon(data = states_map, # state boundaries
               aes(x = long, y = lat, group = group), # mappings
               fill = "grey", color = "black") + # polygon styling
  labs(title = "Tweets in the USA", subtitle = "Major Roads") # labels

p + geom_point(data = roads, # original dataset, which retains tweet coordinates
               aes(x = lng, y = lat, color = search_term), # mappings
               alpha = 1/2, size = 1.5) # point transparency and size

Finalize

Log

This code should produce bar charts, contingency tables, and maps for each group of terms, along with the results of each chi-squared test conducted.
