D Cheatsheets and Quick Reference
D.1 openalexR quick reference
library(openalexR)
# Works from one journal (source S148561398), restricted to articles
# with publication dates from 2020-01-01 to 2023-12-31
works <- oa_fetch(
  entity = "works",
  primary_location.source.id = "S148561398",
  from_publication_date = "2020-01-01",
  to_publication_date = "2023-12-31",
  type = "article"
)

# Works with an author affiliated to institution I63966007,
# restricted to articles with publication dates from 2022-01-01
works <- oa_fetch(
  entity = "works",
  authorships.institutions.id = "I63966007",
  from_publication_date = "2022-01-01",
  type = "article"
)

# Works tagged with topic T10102, publication dates from 2023-01-01
# (no `type` filter here, so every work type is requested)
works <- oa_fetch(
  entity = "works",
  topics.id = "T10102",
  from_publication_date = "2023-01-01"
)
# Look up a single author record by its OpenAlex ID
author <- oa_fetch(
  entity = "authors",
  id = "A5023888391"
)

# Look up a single institution record by its OpenAlex ID
inst <- oa_fetch(
  entity = "institutions",
  id = "I63966007"
)
# Sampling
# Request a sample of 500 works from source S148561398 instead of the
# full result set; the seed is sent along with the sample size
# (presumably so the same sample can be drawn again -- confirm in the
# openalexR / OpenAlex documentation).
works <- oa_fetch(
  entity = "works",
  primary_location.source.id = "S148561398",
  options = list(sample = 500, seed = 42)
)
D.2 quanteda pipeline
library(quanteda)
# Text-analysis pipeline, written as explicit steps
# 1. Corpus from a data frame: `id` gives document ids, `text` the texts
corp <- corpus(df, docid_field = "id", text_field = "text")

# 2. Tokenise (dropping punctuation and numbers), lower-case,
#    then remove English stopwords
toks <- tokens(corp, remove_punct = TRUE, remove_numbers = TRUE)
toks <- tokens_tolower(toks)
toks <- tokens_remove(toks, stopwords("en"))

# 3. Document-feature matrix, trimmed with min_termfreq = 5 and
#    min_docfreq = 3
dfmat <- dfm_trim(dfm(toks), min_termfreq = 5, min_docfreq = 3)

# 4. TF-IDF weighted copy of the matrix
dfmat_tfidf <- dfm_tfidf(dfmat)

# 5. The 20 top features of the unweighted matrix
topfeatures(dfmat, 20)
# Keyword-in-context
# A multi-word pattern has to be wrapped in phrase(): a bare
# "open access" is compared against single tokens and never matches.
kwic(toks, pattern = phrase("open access"), window = 5)
D.3 igraph / tidygraph / ggraph
library(igraph)
library(tidygraph)
library(ggraph)
# Undirected igraph object from an edge data frame, with vertex
# attributes taken from `nodes`
g <- graph_from_data_frame(edges, directed = FALSE, vertices = nodes)

# Convert to a tbl_graph and add node-level columns:
# degree, betweenness, and a Leiden community membership
# NOTE(review): group_leiden() is called with only `resolution` set;
# confirm that its default objective function is the one intended.
tg <- as_tbl_graph(g) |>
  activate(nodes) |>
  mutate(
    degree = centrality_degree(),
    between = centrality_betweenness(),
    community = group_leiden(resolution = 1.0)
  )
# ggraph visualisation
# Edge transparency follows `weight` (assumes the edges carry a
# `weight` attribute -- confirm against the edge list); node size
# follows `degree` and node colour the `community` column, both
# computed on the tbl_graph beforehand.
ggraph(tg, layout = "fr") +
  geom_edge_link(aes(alpha = weight), show.legend = FALSE) +
  geom_node_point(aes(size = degree, colour = factor(community))) +
  geom_node_text(aes(label = name), repel = TRUE, size = 3) +
  theme_void()
D.4 Common data wrangling patterns
library(tidyverse)
# One row per (work, authorship): keep the work id (renamed to
# `work_id`) and expand the nested `authorships` column; the expanded
# columns are prefixed "authorships_"
authors <- works |>
  select(work_id = id, authorships) |>
  unnest(authorships, names_sep = "_")

# One row per (work, topic): same pattern for the nested `topics`
# column, with expanded columns prefixed "topics_"
topics <- works |>
  select(work_id = id, topics) |>
  unnest(topics, names_sep = "_")
# h-index of a vector of citation counts.
#
# citations: numeric vector, one citation count per publication.
# Returns the number of publications whose citation count is at least
# their rank when the counts are ordered from highest to lowest.
compute_h_index <- function(citations) {
  # sort() drops missing values, so NA counts are ignored
  ranked <- sort(citations, decreasing = TRUE)
  ranks <- seq_along(ranked)
  sum(ranked >= ranks)
}
# Field normalisation
# Within each field-year group, divide every work's citation count by
# the group's mean citation count.
works |>
  group_by(field, year) |>
  mutate(
    field_mean = mean(cited_by_count),
    # The denominator is floored at 1, so groups with a mean below 1
    # do not inflate the score (and a zero mean cannot divide by zero)
    mncs = cited_by_count / pmax(field_mean, 1)
  ) |>
  # Drop the grouping so later verbs do not silently run per field-year
  ungroup()
# Fractional counting
# Expand to one row per authorship, then give each row an equal share
# of its work: 1 / (number of authorship rows for that work id).
works |>
  unnest(authorships, names_sep = "_") |>
  group_by(id) |>
  mutate(frac = 1 / n()) |>
  ungroup()
D.5 Companion package functions
The scientometricsInR package provides helper functions used throughout the book:
| Function | Purpose | Example |
|---|---|---|
| `fetch_openalex(...)` | Cached, rate-limited OpenAlex queries | `fetch_openalex(entity = "works", ...)` |
| `dedupe_by_doi(df)` | Remove duplicate records by DOI | `works \|> dedupe_by_doi()` |
| `compute_h_index(citations)` | Compute h-index from citation vector | `compute_h_index(c(10, 8, 5, 3, 1))` → `3` |
| `field_normalize(cites, means)` | MNCS normalisation (vector) | `field_normalize(c(20, 5), c(10, 10))` → `c(2.0, 0.5)` |
| `compute_mncs(df)` | MNCS for a grouped data frame | `df \|> compute_mncs()` |
| `build_coauth_graph(works)` | Co-authorship igraph from works | `build_coauth_graph(works)` |
| `kleinberg_bursts(kw, dates)` | Keyword burst detection | `kleinberg_bursts(keywords, dates)` |
| `palette_sci(n)` | Viridis colour palette | `palette_sci(4)` |
| `theme_sci()` | Minimal ggplot2 theme | `+ theme_sci()` |
This book was built by the bookdown R package.