Lab-03-Dodson

library(pander)

## Warning: package 'pander' was built under R version 4.4.1

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.4.1

## Warning: package 'ggplot2' was built under R version 4.4.1

## Warning: package 'tibble' was built under R version 4.4.1

## Warning: package 'tidyr' was built under R version 4.4.1

## Warning: package 'readr' was built under R version 4.4.1

## Warning: package 'purrr' was built under R version 4.4.1

## Warning: package 'dplyr' was built under R version 4.4.1

## Warning: package 'stringr' was built under R version 4.4.1

## Warning: package 'forcats' was built under R version 4.4.1

## Warning: package 'lubridate' was built under R version 4.4.1

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(stringi)

# Remote CSV with the article data used throughout this lab.
URL <- "https://raw.githubusercontent.com/DS4PS/cpp-527-fall-2020/master/labs/data/medium-data-utf8-v2.csv"
d <- read.csv(file = URL)

# Columns displayed whenever the data frame is previewed.
preview.these <- c(
  "title", "subtitle", "claps",
  "reading_time", "publication", "date"
)

# Show the first rows of the preview columns as a formatted table.
pander(head(d[preview.these]))

Table continues below
title	subtitle	claps	reading_time
A Beginner’s Guide to Word Embedding with Gensim Word2Vec Model		850	8
Hands-on Graph Neural Networks with PyTorch & PyTorch Geometric		1100	9
How to Use ggplot2 in Python	A Grammar of Graphics for Python	767	5
Databricks: How to Save Files in CSV on Your Local Computer	When I work on Python projects dealing…	354	4
A Step-by-Step Implementation of Gradient Descent and Backpropagation	One example of building neural…	211	4
An Easy Introduction to SQL for Data Scientists		563	8

publication	date
Towards Data Science	5/30/2019
Towards Data Science	5/30/2019
Towards Data Science	5/30/2019
Towards Data Science	5/30/2019
Towards Data Science	5/30/2019
Towards Data Science	5/30/2019

# Replace every whitespace character in the titles with a plain space.
d$title <- gsub(pattern = "\\s", replacement = " ", x = d$title)

# Preview the cleaned titles next to the other preview columns.
pander(head(d[preview.these]))

Table continues below
title	subtitle	claps	reading_time
A Beginner’s Guide to Word Embedding with Gensim Word2Vec Model		850	8
Hands-on Graph Neural Networks with PyTorch & PyTorch Geometric		1100	9
How to Use ggplot2 in Python	A Grammar of Graphics for Python	767	5
Databricks: How to Save Files in CSV on Your Local Computer	When I work on Python projects dealing…	354	4
A Step-by-Step Implementation of Gradient Descent and Backpropagation	One example of building neural…	211	4
An Easy Introduction to SQL for Data Scientists		563	8

publication	date
Towards Data Science	5/30/2019
Towards Data Science	5/30/2019
Towards Data Science	5/30/2019
Towards Data Science	5/30/2019
Towards Data Science	5/30/2019
Towards Data Science	5/30/2019

# Remove the literal "<U+200A>—<U+200A>" marker from the titles.
# - fixed = TRUE: as a regular expression the "+" is a quantifier on "U",
#   so the unescaped pattern could never match the literal "+" character.
# - This must run BEFORE the generic tag stripping below, which would
#   otherwise consume both "<U+200A>" pieces and leave the dash behind.
# NOTE(review): the replacement is "" as originally written, which joins the
# words on either side of the marker; confirm " " was not intended.
d$title <- gsub("<U+200A>—<U+200A>", "", d$title, fixed = TRUE)

# Strip anything shaped like a tag: "<", one or more non-">" characters, ">".
d$title <- gsub("<[^>]+>", "", d$title)

# Histogram of the raw clap counts, with the x-axis limited to 1..950.
# The column is referenced by its full name `claps`: `d$clap` only worked
# through `$` partial matching, which returns NULL as soon as a second
# column starting with "clap" (such as `clap.score` below) exists.
hist(d$claps, main = "Raw Clap Count",
     xlim = c(1, 950), breaks = 5000, col = "gray20", border = "white")

# Histogram of the log10-transformed counts; +1 keeps zero claps finite.
hist(log10(d$claps + 1), main = "Logged Clap Score",
     col = "gray20", border = "white", breaks = 100)

# Store the logged score as the outcome used in the comparisons below.
d$clap.score <- log10(d$claps + 1)

# One regular expression per title format; each becomes a logical column
# of `d` with the same name.
#   power_list  - title starts with digits followed by whitespace
#   how_to      - title starts with "How" or "How to" as whole words
#   colon_style - title contains a colon anywhere
#   question    - title ends with a question mark
title_patterns <- list(
  power_list  = "^[0-9]+\\s",
  how_to      = "^How( to)?\\b",
  colon_style = ":",
  question    = "\\?$"
)
d[names(title_patterns)] <- lapply(title_patterns, grepl, x = d$title)

# A title is "other" when it matches none of the four formats above.
d$other <- !d$power_list & !d$how_to & !d$colon_style & !d$question

Q1:A

# Mean logged clap score for each title format. The formats are not
# mutually exclusive, so one title can contribute to several rows.
avg_clap_scores <- data.frame(
  Title_Type = c(
    "Power List", "How-To Guide", "Colon Style", "Question", "Other"
  ),
  Avg_Clap_Score = vapply(
    c("power_list", "how_to", "colon_style", "question", "other"),
    function(flag) mean(d$clap.score[d[[flag]]], na.rm = TRUE),
    numeric(1),
    USE.NAMES = FALSE
  )
)

print(avg_clap_scores)

##     Title_Type Avg_Clap_Score
## 1   Power List       2.165979
## 2 How-To Guide       2.152929
## 3  Colon Style       1.966616
## 4     Question       1.944782
## 5        Other       2.045855

Q1:B

Titles that use the “Power List” format have the highest average clap score (2.17 on the log10 scale), narrowly ahead of “How-To Guide” titles (2.15).

# Lower-case the titles so that word counts ignore capitalisation.
d$title <- d$title %>% tolower()

# Split each title on single spaces and pool all words into one vector.
word.list <- d$title %>% strsplit(split = " ")
word.vector <- word.list %>% unlist()

# Tabulate the words and order them from most to least frequent.
word.counts <- table(word.vector)
top_words <- word.counts %>% sort(decreasing = TRUE)

# Keep and display the 25 most frequent words.
top_25_words <- top_words %>% head(25)
pander(top_25_words)

Table continues below
to	the	a	how	of	and	your	in	for	you	is	with
1715	1667	1116	912	858	738	638	615	600	528	465	413

why	what	data	i	on	from	an	can	are	my	be	learning	using
345	298	290	289	253	222	218	190	186	180	177	168	157

Q2:B

# Return the first word of each string in `x`.
#
# Args:
#   x: character vector of titles (a single string works as before).
#
# Returns:
#   Character vector the same length as `x` holding the first
#   whitespace-delimited word of each element; NA for empty or NA input.
#
# Leading whitespace is trimmed and runs of whitespace are treated as one
# separator, so a title such as " how to ..." yields "how" rather than "".
get_first_word <- function(x) {
  words <- strsplit(trimws(x, which = "left"), "\\s+")
  # w[1] is NA_character_ when a title has no words at all.
  vapply(words, function(w) w[1], character(1))
}


# First word of every title. vapply() pins the result to a character
# vector; sapply() would silently return a list for empty or ragged input.
first_words <- vapply(d$title, get_first_word, character(1))

# Tabulate how often each first word occurs.
first_word_counts <- table(first_words)

# Keep only the most frequent first word and its count, then display it.
most_common_first_word <- sort(first_word_counts, decreasing = TRUE)[1]

most_common_first_word %>% pander()

how
742

Q2:C

# Final run of word characters in each title. Titles whose last character
# is not a word character have no match and are omitted by regmatches().
last_words <- regmatches(
  x = d$title,
  m = regexpr(pattern = "\\w+$", text = d$title)
)

# Tabulate the last words and order them from most to least frequent.
last_word_counts <- table(last_words)
sorted_last_word_counts <- sort(last_word_counts, decreasing = TRUE)

# Display the most frequent last word with its count.
pander(sorted_last_word_counts[1])

learning
74

Lab-03-Dodson

2025-02-03

Q1:A

Q1:B

Titles that use the “Power List” format have the highest average clap score (2.17 on the log10 scale), narrowly ahead of “How-To Guide” titles (2.15).

Q2:B

Q2:C