library(pander)
## Warning: package 'pander' was built under R version 4.4.1
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.1
## Warning: package 'ggplot2' was built under R version 4.4.1
## Warning: package 'tibble' was built under R version 4.4.1
## Warning: package 'tidyr' was built under R version 4.4.1
## Warning: package 'readr' was built under R version 4.4.1
## Warning: package 'purrr' was built under R version 4.4.1
## Warning: package 'dplyr' was built under R version 4.4.1
## Warning: package 'stringr' was built under R version 4.4.1
## Warning: package 'forcats' was built under R version 4.4.1
## Warning: package 'lubridate' was built under R version 4.4.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(stringi)
# Location of the Medium articles CSV used throughout this analysis.
URL <- "https://raw.githubusercontent.com/DS4PS/cpp-527-fall-2020/master/labs/data/medium-data-utf8-v2.csv"
# Download and parse the CSV into a data frame.
d <- read.csv(URL)
# Columns shown whenever the data is previewed.
preview.these <- c(
  "title", "subtitle", "claps",
  "reading_time", "publication", "date"
)
# Render the first six rows of the preview columns as a pander table.
d[preview.these] %>%
  head() %>%
  pander()
Table continues below

| title | subtitle | claps | reading_time |
|---|---|---|---|
| A Beginner’s Guide to Word Embedding with Gensim Word2Vec Model | | 850 | 8 |
| Hands-on Graph Neural Networks with PyTorch & PyTorch Geometric | | 1100 | 9 |
| How to Use ggplot2 in Python | A Grammar of Graphics for Python | 767 | 5 |
| Databricks: How to Save Files in CSV on Your Local Computer | When I work on Python projects dealing… | 354 | 4 |
| A Step-by-Step Implementation of Gradient Descent and Backpropagation | One example of building neural… | 211 | 4 |
| An Easy Introduction to SQL for Data Scientists | | 563 | 8 |

| publication | date |
|---|---|
| Towards Data Science | 5/30/2019 |
| Towards Data Science | 5/30/2019 |
| Towards Data Science | 5/30/2019 |
| Towards Data Science | 5/30/2019 |
| Towards Data Science | 5/30/2019 |
| Towards Data Science | 5/30/2019 |
# Replace every whitespace character in the titles with a plain space.
d$title <- gsub(pattern = "\\s", replacement = " ", x = d$title)
# Preview the first six rows again after the clean-up.
d[preview.these] %>%
  head() %>%
  pander()
Table continues below

| title | subtitle | claps | reading_time |
|---|---|---|---|
| A Beginner’s Guide to Word Embedding with Gensim Word2Vec Model | | 850 | 8 |
| Hands-on Graph Neural Networks with PyTorch & PyTorch Geometric | | 1100 | 9 |
| How to Use ggplot2 in Python | A Grammar of Graphics for Python | 767 | 5 |
| Databricks: How to Save Files in CSV on Your Local Computer | When I work on Python projects dealing… | 354 | 4 |
| A Step-by-Step Implementation of Gradient Descent and Backpropagation | One example of building neural… | 211 | 4 |
| An Easy Introduction to SQL for Data Scientists | | 563 | 8 |

| publication | date |
|---|---|
| Towards Data Science | 5/30/2019 |
| Towards Data Science | 5/30/2019 |
| Towards Data Science | 5/30/2019 |
| Towards Data Science | 5/30/2019 |
| Towards Data Science | 5/30/2019 |
| Towards Data Science | 5/30/2019 |
# Drop the hair-space-wrapped em dash from titles. This runs BEFORE the tag
# stripping below: otherwise a literal "<U+200A>" escape would already have
# been removed as if it were an HTML tag, leaving a bare dash behind.
# The separator is matched both as the real U+200A character and as the
# "<U+200A>" text form; "+" is escaped because an unescaped "+" is a regex
# quantifier and never matches a literal plus sign.
d$title <- gsub("(\u200a|<U\\+200A>)—(\u200a|<U\\+200A>)", "", d$title)
# Strip anything that looks like an HTML tag (<...>).
d$title <- gsub("<[^>]+>", "", d$title)
# Distribution of raw clap counts, with the x-axis limited to 1-950 claps.
# The full column name `claps` is used on purpose: `d$clap` only resolves
# through `$` partial matching and becomes NULL once `clap.score` exists
# (two columns would then start with "clap"), which breaks any re-run.
hist(d$claps,
  main = "Raw Clap Count",
  xlim = c(1, 950), breaks = 5000, col = "gray20", border = "white"
)

# Same distribution on a log10 scale; the +1 keeps zero-clap rows finite.
hist(log10(d$claps + 1),
  main = "Logged Clap Score",
  col = "gray20", border = "white", breaks = 100
)

# Store the logged score for the comparisons that follow.
d$clap.score <- log10(d$claps + 1)
# Flag each title by stylistic pattern (a title may match more than one).
# Starts with digits followed by a whitespace character.
d[["power_list"]] <- grepl("^[0-9]+\\s", d[["title"]])
# Starts with the word "How", optionally followed by " to".
d[["how_to"]] <- grepl("^How( to)?\\b", d[["title"]])
# Contains a colon anywhere in the title.
d[["colon_style"]] <- grepl(":", d[["title"]])
# Ends with a question mark.
d[["question"]] <- grepl("\\?$", d[["title"]])
# Matches none of the four patterns above.
d[["other"]] <- !d[["power_list"]] & !d[["how_to"]] &
  !d[["colon_style"]] & !d[["question"]]
Q1:A
# Mean logged clap score for each title category, one row per category.
# The flag columns are listed in the same order as the display labels.
avg_clap_scores <- data.frame(
  Title_Type = c("Power List", "How-To Guide", "Colon Style", "Question", "Other"),
  Avg_Clap_Score = vapply(
    c("power_list", "how_to", "colon_style", "question", "other"),
    function(flag) mean(d$clap.score[d[[flag]]], na.rm = TRUE),
    numeric(1),
    USE.NAMES = FALSE
  )
)
print(avg_clap_scores)
## Title_Type Avg_Clap_Score
## 1 Power List 2.165979
## 2 How-To Guide 2.152929
## 3 Colon Style 1.966616
## 4 Question 1.944782
## 5 Other 2.045855
Q1:B
Q2:B
# Return the first whitespace-delimited word of each string in `x`.
#
# x: character vector of titles (a single string works as before).
# Returns a character vector the same length as `x`; an element is
# NA_character_ when the string is empty, all whitespace, or NA.
get_first_word <- function(x) {
  # Remove leading whitespace first so a title that starts with a blank
  # does not yield "" as its first word.
  tokens <- strsplit(sub("^\\s+", "", x), "\\s+")
  vapply(
    tokens,
    function(words) if (length(words) > 0) words[1] else NA_character_,
    character(1),
    USE.NAMES = FALSE
  )
}
# First word of every title. vapply() guarantees a character vector even for
# empty input, and USE.NAMES = FALSE avoids attaching each full title as an
# element name (which sapply() would do for a character input).
first_words <- vapply(d$title, get_first_word, character(1), USE.NAMES = FALSE)
# Frequency of each first word.
first_word_counts <- table(first_words)
# Most frequent first word together with its count.
most_common_first_word <- sort(first_word_counts, decreasing = TRUE)[1]
most_common_first_word %>% pander()
Q2: C
# Last word of every title. Trailing non-word characters ("?", ".", quotes,
# blanks) are removed first; otherwise "\\w+$" finds no match for such
# titles and regmatches() silently leaves them out of the tally.
trimmed_titles <- sub("\\W+$", "", d$title)
last_words <- regmatches(trimmed_titles, regexpr("\\w+$", trimmed_titles))
# Frequency of each last word, most common first.
last_word_counts <- table(last_words)
sorted_last_word_counts <- sort(last_word_counts, decreasing = TRUE)
# Most frequent last word together with its count.
sorted_last_word_counts[1] %>% pander