library(pander)
## Warning: package 'pander' was built under R version 4.4.1
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.1
## Warning: package 'ggplot2' was built under R version 4.4.1
## Warning: package 'tibble' was built under R version 4.4.1
## Warning: package 'tidyr' was built under R version 4.4.1
## Warning: package 'readr' was built under R version 4.4.1
## Warning: package 'purrr' was built under R version 4.4.1
## Warning: package 'dplyr' was built under R version 4.4.1
## Warning: package 'stringr' was built under R version 4.4.1
## Warning: package 'forcats' was built under R version 4.4.1
## Warning: package 'lubridate' was built under R version 4.4.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringi)

# Read the remote CSV into the data frame `d`.
URL <- "https://raw.githubusercontent.com/DS4PS/cpp-527-fall-2020/master/labs/data/medium-data-utf8-v2.csv"
d <- read.csv(file = URL)

# Columns displayed whenever the data is previewed.
preview.these <- c(
  "title", "subtitle", "claps",
  "reading_time", "publication", "date"
)
pander(head(d[preview.these]))
Table continues below
title subtitle claps reading_time
A Beginner’s Guide to Word Embedding with Gensim Word2Vec Model 850 8
Hands-on Graph Neural Networks with PyTorch & PyTorch Geometric 1100 9
How to Use ggplot2 in Python A Grammar of Graphics for Python 767 5
Databricks: How to Save Files in CSV on Your Local Computer When I work on Python projects dealing… 354 4
A Step-by-Step Implementation of Gradient Descent and Backpropagation One example of building neural… 211 4
An Easy Introduction to SQL for Data Scientists 563 8
publication date
Towards Data Science 5/30/2019
Towards Data Science 5/30/2019
Towards Data Science 5/30/2019
Towards Data Science 5/30/2019
Towards Data Science 5/30/2019
Towards Data Science 5/30/2019
# Swap every whitespace character in the titles for a plain space, then
# preview the first rows of the selected columns again.
d[["title"]] <- gsub(pattern = "\\s", replacement = " ", x = d[["title"]])
pander(head(d[preview.these]))
Table continues below
title subtitle claps reading_time
A Beginner’s Guide to Word Embedding with Gensim Word2Vec Model 850 8
Hands-on Graph Neural Networks with PyTorch & PyTorch Geometric 1100 9
How to Use ggplot2 in Python A Grammar of Graphics for Python 767 5
Databricks: How to Save Files in CSV on Your Local Computer When I work on Python projects dealing… 354 4
A Step-by-Step Implementation of Gradient Descent and Backpropagation One example of building neural… 211 4
An Easy Introduction to SQL for Data Scientists 563 8
publication date
Towards Data Science 5/30/2019
Towards Data Science 5/30/2019
Towards Data Science 5/30/2019
Towards Data Science 5/30/2019
Towards Data Science 5/30/2019
Towards Data Science 5/30/2019
# Clean the titles in two passes.
#
# Pass 1: collapse the "hair space, em dash, hair space" separator into a
# single space. The gap on either side of the dash is accepted in three
# forms: the real U+200A character, the escaped text "<U+200A>" (the "+" is
# escaped so it is not read as a regex quantifier), or ordinary whitespace
# (the earlier "\\s" substitution may already have turned hair spaces into
# plain spaces). A space is used as the replacement so the words on either
# side of the dash are not glued together.
#
# Pass 2: strip anything that looks like an HTML tag. This must run after
# pass 1, otherwise the escaped "<U+200A>" text would be deleted as a tag
# before the separator could be recognised.
title_gap <- "(\\s|\u200A|<U\\+200A>)+"
em_dash_separator <- paste0(title_gap, "\u2014", title_gap)
d$title <- gsub(em_dash_separator, " ", d$title)
d$title <- gsub("<[^>]+>", "", d$title)
# Plot the clap distribution on the raw and on the log10 scale, then store
# the logged value as `clap.score`.
#
# The column is referenced by its full name `claps`. The abbreviated
# `d$clap` only works through `$` partial matching, and stops working
# (returns NULL) as soon as `clap.score` exists, because the prefix "clap"
# then matches two columns.
hist(d$claps, main = "Raw Clap Count",
     xlim = c(1, 950), breaks = 5000, col = "gray20", border = "white")

# Adding 1 before taking the log keeps zero-clap articles finite.
hist(log10(d$claps + 1), main = "Logged Clap Score",
     col = "gray20", border = "white", breaks = 100)

d$clap.score <- log10(d$claps + 1)
# Flag each title by style. A title can carry more than one flag.

# Starts with a number followed by whitespace.
d$power_list <- grepl(pattern = "^[0-9]+\\s", x = d$title)

# Starts with the word "How", optionally followed by " to".
d$how_to <- grepl(pattern = "^How( to)?\\b", x = d$title)

# Contains a colon anywhere.
d$colon_style <- grepl(pattern = ":", x = d$title)

# Ends with a question mark.
d$question <- grepl(pattern = "\\?$", x = d$title)

# Carries none of the four flags above.
d$other <- !d$power_list & !d$how_to & !d$colon_style & !d$question

Q1:A

# Mean logged clap score for each title style. The flag columns are visited
# in the same order as the labels in `Title_Type`.
avg_clap_scores <- data.frame(
  Title_Type = c(
    "Power List", "How-To Guide", "Colon Style", "Question", "Other"
  ),
  Avg_Clap_Score = vapply(
    d[c("power_list", "how_to", "colon_style", "question", "other")],
    function(is_type) mean(d$clap.score[is_type], na.rm = TRUE),
    numeric(1),
    USE.NAMES = FALSE
  )
)

print(avg_clap_scores)
##     Title_Type Avg_Clap_Score
## 1   Power List       2.165979
## 2 How-To Guide       2.152929
## 3  Colon Style       1.966616
## 4     Question       1.944782
## 5        Other       2.045855

Q1:B

Titles that use the “Power List” format have the highest average clap score (2.17), narrowly ahead of “How-To Guide” titles (2.15).

# Count how often each word appears across all titles and show the 25 most
# frequent ones. Titles are lower-cased first so that "How" and "how" are
# tallied together.
d$title <- tolower(d$title)
word.list <- strsplit(d$title, " ")
word.vector <- unlist(word.list)

# Splitting on a single space produces empty strings wherever a title has
# leading or consecutive spaces; drop them so they are not counted as words.
word.vector <- word.vector[nzchar(word.vector)]
word.counts <- table(word.vector)

top_words <- sort(word.counts, decreasing = TRUE)
top_25_words <- head(top_words, 25)

top_25_words %>% pander()
Table continues below
to the a how of and your in for you is with
1715 1667 1116 912 858 738 638 615 600 528 465 413
why what data i on from an can are my be learning using
345 298 290 289 253 222 218 190 186 180 177 168 157

Q2:B

#' Extract the first space-delimited word of a string.
#'
#' Leading whitespace is removed before splitting, so a title that starts
#' with a space yields its first real word instead of an empty string.
#'
#' @param x A character string; only the first element is used.
#' @return The first word as a character string, or `NA` when `x` is empty
#'   or `NA`.
get_first_word <- function(x) {
  words <- strsplit(trimws(x, which = "left"), " ")[[1]]
  words[1]
}


# Tally the first word of every title and report the most frequent one.
#
# vapply() guarantees a character vector with one entry per title, and
# USE.NAMES = FALSE stops every full title from being attached as an
# element name (which sapply() does for character input).
first_words <- vapply(
  d$title, get_first_word, character(1),
  USE.NAMES = FALSE
)

first_word_counts <- table(first_words)

# Sorting in decreasing order puts the most frequent first word in slot 1.
most_common_first_word <- sort(first_word_counts, decreasing = TRUE)[1]

most_common_first_word %>% pander()
how
742

Q2:C

# Tally the last word of every title and report the most frequent one.
#
# Trailing non-word characters (for example "?", "!", ")" or a stray space)
# are stripped first. Without this step the "\\w+$" pattern finds no match
# in any title that ends with punctuation, and regmatches() silently leaves
# those titles out of the count.
title_ends <- sub("\\W+$", "", d$title)
last_words <- regmatches(title_ends, regexpr("\\w+$", title_ends))

last_word_counts <- table(last_words)

# Sorting in decreasing order puts the most frequent last word in slot 1.
sorted_last_word_counts <- sort(last_word_counts, decreasing = TRUE)

sorted_last_word_counts[1] %>% pander
learning
74