4 Lattes Projects
rio::import('rawfiles/projetos.rds') |>
tibble::tibble() ->
projetos
# rio::export(projetos, 'rawfiles/projetos.xlsx')
projetos |>
dplyr::group_by(natureza, situacao) |>
tally(name = 'quantidade') |>
datatable(
extensions = 'Buttons',
rownames = F,
options = list(
dom = 'Bfrtip',
pageLength = 12,
buttons = list(list(
extend = 'collection',
buttons = list(list(extend = 'csv', filename = 'data'),
list(extend = 'excel', filename = 'data')),
text = 'Download'))))
projetos |>
dplyr::filter(natureza == 'PESQUISA') ->
pesq
pesq |>
dplyr::mutate(texto = paste(nome_do_projeto, descricao_do_projeto, sep = '. ')) |>
dplyr::mutate(title = textcleaner(nome_do_projeto)) |>
dplyr::distinct(title, .keep_all = TRUE) ->
pesq
pesq |>
dplyr::count(ano_inicio, name = 'quantidade') |>
datatable(
extensions = 'Buttons',
rownames = F,
options = list(
dom = 'Bfrtip',
pageLength = 26,
buttons = list(list(
extend = 'collection',
buttons = list(list(extend = 'csv', filename = 'data'),
list(extend = 'excel', filename = 'data')),
text = 'Download'))))
Project funders.
projetos |>
dplyr::select(nome_do_projeto, id, financiadores) |>
tidyr::unnest(financiadores) |>
dplyr::group_by(nome_do_projeto) |>
dplyr::summarise(nome_instituicao = nome_instituicao, codigo_instituicao = codigo_instituicao) |>
dplyr::filter(nome_instituicao != '') |>
dplyr::ungroup() |>
dplyr::group_by(nome_instituicao, codigo_instituicao) |>
dplyr::tally(name = 'qtde', sort = T) |>
{\(x) rio::export(x, 'rawfiles/lattes_projetos_financiadores.xlsx')}()
## Warning: Returning more (or less) than 1 row per `summarise()` group was deprecated in dplyr 1.1.0.
## ℹ Please use `reframe()` instead.
## ℹ When switching from `summarise()` to `reframe()`, remember that `reframe()` always returns an ungrouped data frame and adjust accordingly.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
## `summarise()` has grouped output by 'nome_do_projeto'. You can override using
## the `.groups` argument.
4.1 Structural Topic Modeling
stm::textProcessor(documents = pesq$texto,
metadata = pesq[, c('nome_do_projeto', 'descricao_do_projeto', 'ano_inicio', 'ano_fim')],
lowercase = TRUE,
removestopwords = TRUE,
removenumbers = FALSE,
removepunctuation = TRUE,
stem = TRUE,
wordLengths = c(3, Inf),
sparselevel = 1,
language = "pt",
verbose = TRUE,
onlycharacter = TRUE,
striphtml = FALSE,
customstopwords = NULL,
v1 = FALSE) ->
pesq_prep
## Building corpus...
## Converting to Lower Case...
## Removing punctuation...
## Removing stopwords...
## Stemming...
## Creating Output...
# class(pesq_prep)
# names(pesq_prep)
stm::prepDocuments(pesq_prep$documents,
pesq_prep$vocab,
pesq_prep$meta,
lower.thresh = 3) ->
pesq_doc
## Removing 4185 of 5589 terms (6066 of 24883 tokens) due to frequency
## Your corpus now has 470 documents, 1404 terms and 18817 tokens.
# names(pesq_doc)
# pesq_prep$documents |> head()
# pesq_prep$vocab |> head()
# pesq_prep$meta |> head()
# ------------------------------
## search K topics
# tictoc::tic()
# stm::searchK(documents = pesq_doc$documents,
# vocab = pesq_doc$vocab,
# K = c(2, 5, 10:15, 20, 25, 30, 40, 50, 60),
# N = 100,
# proportion = 0.5,
# heldout.seed = 1234,
# M = 10,
# cores = 1,
# # prevalence = ~ ano_inicio,
# max.em.its = 75,
# data = pesq_doc$meta,
# init.type = "Spectral",
# verbose = F) ->
# pesq_searchK
# tictoc::toc()
#
# rio::export(pesq_searchK, 'rawfiles/pesq_searchK_mod1.rds')
rio::import('rawfiles/pesq_searchK_mod1.rds') ->
pesq_searchK
data.frame(K = unlist(pesq_searchK$results$K),
semcoh = unlist(pesq_searchK$results$semcoh),
exclus = unlist(pesq_searchK$results$exclus)) ->
res
res$cor <- ifelse(res$K %in% c(10, 14, 25), 'selecionado', 'nao')
ggplot2::ggplot(res, aes(x = semcoh, y = exclus)) +
ggplot2::geom_point(shape = 21, size = 3, position = "identity") +
ggplot2::geom_line() +
ggrepel::geom_text_repel(data = res, aes(label = K), size = 6) +
ggplot2::geom_vline(xintercept = res[res$cor == 'selecionado', 'semcoh'], linetype = 'dashed') +
ggplot2::labs(x = 'Semantic Coherence', y = 'Exclusivity', title = 'Model 1') +
ggplot2::theme_classic() +
ggplot2::theme(legend.position = "none")
Kuhn (2018) also chose the amount of topics using the same type of graph.
15 topics
# ------------------------------
## Modeling
stm(documents = pesq_doc$documents,
vocab = pesq_doc$vocab,
K = 15,
data = pesq_doc$meta,
verbose = F) ->
pesq_stm_fit
tidytext::tidy(pesq_stm_fit) |>
dplyr::arrange(beta) |>
dplyr::group_by(topic) |>
dplyr::top_n(10, beta) |>
dplyr::arrange(-beta) |>
dplyr::select(topic, term) |>
dplyr::summarise(terms = list(term)) |>
dplyr::mutate(terms = purrr::map(terms, paste, collapse = ", ")) |>
tidyr::unnest() ->
top_terms
## Warning: `cols` is now required when using `unnest()`.
## ℹ Please use `cols = c(terms)`.
n_documents <- dim(pesq_stm_fit$theta)[1]
tidytext::tidy(pesq_stm_fit, matrix = "gamma") |>
dplyr::group_by(document) |>
dplyr::arrange(dplyr::desc(gamma)) |>
dplyr::slice_head(n = 1) |>
dplyr::ungroup() |>
dplyr::group_by(topic) |>
dplyr::summarise(topic_proportion = n() / n_documents) ->
topic_proportion
topic_proportion |>
left_join(top_terms, by = "topic") |>
mutate(topic = paste0("Topic ", topic)) |>
dplyr::arrange(dplyr::desc(topic_proportion)) ->
topic_proportion2
topic_proportion2 |>
dplyr::mutate(topic = tolower(.data$topic)) |>
dplyr::mutate(topic = gsub(' ', '_', .data$topic)) |>
dplyr::mutate(topic_proportion = round(.data$topic_proportion, digits = 3)) ->
topic_proportion3
rio::export(topic_proportion3, '~/Sync/pirarucu/topic_proportion.xlsx')
topic_proportion3 |>
DT::datatable(
extensions = 'Buttons',
rownames = F,
options = list(
dom = 'Bfrtip',
pageLength = 15,
buttons = list(list(
extend = 'collection',
buttons = list(list(extend = 'csv', filename = 'data'),
list(extend = 'excel', filename = 'data')),
text = 'Download'))))
tidytext::tidy(pesq_stm_fit, matrix = "theta", document_names = pesq_prep$meta$nome_do_projeto) |>
dplyr::left_join(pesq |> dplyr::rename(document = nome_do_projeto)) |>
dplyr::distinct(.keep_all = TRUE) |>
dplyr::arrange(document, topic) ->
gamma_documents
gamma_documents |>
dplyr::group_by(topic) |>
dplyr::arrange(dplyr::desc(gamma)) |>
dplyr::slice_head(n = 200) |>
dplyr::ungroup() ->
gamma_documents
gamma_documents |>
dplyr::mutate(topic = paste('topic', topic, sep = '_')) |>
dplyr::mutate(gamma = round(.data$gamma, digits = 3)) |>
dplyr::select(document, topic, gamma, ano_inicio, descricao_do_projeto) ->
tab_top_15
rio::export(tab_top_15, 'rawfiles/stm_tab_top_15.xlsx')
tab_top_15 |>
DT::datatable(
rownames = FALSE,
filter = 'bottom',
extensions = 'Buttons',
options = list(
dom = 'Blfrtip', pageLength = 5,
columnDefs = list(list(visible = FALSE, targets = c(4))),
buttons = list(list(extend = 'colvis', columns = c(0, 1, 2, 4)))
))
References
KUHN, K. D. Using structural topic modeling to identify latent topics and trends in aviation incident reports. Transportation Research Part C: Emerging Technologies, v. 87, p. 105–122, fev. 2018.