13  Raspagem OJS

utils.R
Código
# Lists the table-of-contents URL of every issue linked from an archive page.
#
# Args:
#   url: address of the journal's issue archive page.
#
# Returns:
#   A character vector with one "<issue link>/showToc" entry per issue link
#   found on the archive page and on the pagination pages it links to, or
#   character(0) when no issue link is found.
getIssues <- function(url) {

    print(url)

    # Distinct, non-missing hrefs of the links inside the element with
    # id "issues" on the given page.
    archiveLinks <- function(page) {
        read_html(page) %>>%
            html_nodes(xpath = '//*[@id="issues"]') %>>%
            html_nodes('a') %>>%
            html_attr("href") %>>%
            (unique(.[!is.na(.)]))
    }

    firstPage <- archiveLinks(url)

    # Links containing 'Page' are treated as pagination links, the rest as issues.
    isPager <- grepl('Page', firstPage)

    # The first page is already in hand, so only the additional pagination
    # pages are downloaded here.
    otherPages <- lapply(firstPage[isPager], function(page) {
        links <- archiveLinks(page)
        links[!grepl('Page', links)]
    })

    issues <- c(firstPage[!isPager], unlist(otherPages))

    # paste0() on an empty vector would yield the bogus URL "/showToc".
    if (length(issues) == 0) {
        return(character(0))
    }

    paste0(issues, '/showToc')

}


# Collects the article links listed on an issue's table-of-contents page.
#
# Args:
#   url: address of the table-of-contents page.
#
# Returns:
#   The href attribute of every link found inside a '.tocTitle' element that
#   sits inside a '.tocArticle' element.
getSummaries <- function(url) {

    print(url)

    toc <- read_html(url)

    # Narrow down step by step: article entry -> title cell -> anchor.
    entries <- html_nodes(toc, '.tocArticle')
    titles <- html_nodes(entries, '.tocTitle')
    anchors <- html_nodes(titles, 'a')

    html_attr(anchors, 'href')
}

# Scrapes the metadata of a single article page.
#
# Args:
#   url: address of the article page.
#
# Returns:
#   A one-row tibble with the character columns articleTitle, authorString,
#   articleAbstract, articleSubject, doi and issue. articleSubject falls back
#   to 'No Keywords' and doi to 'No DOI' when the page has no such element;
#   the remaining columns are NA when their element is missing.
getPapersInfo <- function(url) {

    print(url)

    artigo <- read_html(url)

    # Text of the first node matched by `xpath`, or `default` when the page
    # has no such node. Returning exactly one value per field keeps the result
    # at one row even when an element is absent.
    field <- function(xpath, default = NA_character_) {
        found <- artigo %>>%
            html_nodes(xpath = xpath) %>>%
            html_text
        if (length(found) == 0) default else found[[1]]
    }

    tibble(
        articleTitle    = field('//*[@id="articleTitle"]'),
        authorString    = field('//*[@id="authorString"]'),
        articleAbstract = field('//*[@id="articleAbstract"]'),
        articleSubject  = field('//*[@id="articleSubject"]', 'No Keywords'),
        doi             = field('//*[@id="pub-id::doi"]', 'No DOI'),
        # The breadcrumb text is stored as the issue description.
        issue           = field('//*[@id="breadcrumb"]')
    )

}
wos-downloader.R
Código
library(rio)
library(pipeR)
library(tidyverse)
library(httr)
library(xml2)
library(rvest)

source('utils.R')

# ------------------------------------------------------------
# Análise Econômica

url <- 'https://seer.ufrgs.br/AnaliseEconomica/issue/archive'

# ------------------------------
# getIssues: table-of-contents URL of every issue in the archive
issues <- getIssues(url)

# ------------------------------
# getSummary: article links of every issue, flattened into one vector
summaries <- lapply(issues, getSummaries)
summaries <- unlist(summaries)

# ------------------------------
# getPapersInfo
# NOTE(review): only the first five articles are scraped here, unlike the
# other journals below -- confirm whether this sample limit is intended.
# head() avoids the NA entries that summaries[1:5] produces when fewer than
# five links exist.
papers <- lapply(head(summaries, 5), getPapersInfo)

# glimpse must be followed by the pipe operator; otherwise the export step
# is parsed as a separate statement in which `.` is undefined.
papers %>>%
    bind_rows() %>>%
    glimpse %>>%
    (export(., "AnaliseEconomica.rds"))


# ------------------------------------------------------------
# Brazilian Review of Econometrics

# Walk the archive: issue pages -> article links -> one table per article.
url <- 'http://bibliotecadigital.fgv.br/ojs/index.php/bre/issue/archive'
issues <- getIssues(url)
summaries <- unlist(lapply(issues, getSummaries))
papers <- lapply(summaries, getPapersInfo)

# Stack the per-article tables and save the result.
bind_rows(papers) %>>%
    (export(., "BrazilianReviewEconometrics.rds"))

# Quick look at the first scraped article.
getPapersInfo(summaries[[1]]) %>>% glimpse()

# ------------------------------------------------------------
# Revista de Economia Contemporânea

# Walk the archive: issue pages -> article links -> one table per article.
url <- 'https://revistas.ufrj.br/index.php/rec/issue/archive'
issues <- getIssues(url)
summaries <- unlist(lapply(issues, getSummaries))
papers <- lapply(summaries, getPapersInfo)

# Stack the per-article tables and save the result.
bind_rows(papers) %>>%
    (export(., "RevistaEconomiaContemporanea.rds"))


# ------------------------------------------------------------
# Brazilian Review of Finance

# Walk the archive: issue pages -> article links -> one table per article.
url <- 'http://bibliotecadigital.fgv.br/ojs/index.php/rbfin/issue/archive'
issues <- getIssues(url)
summaries <- unlist(lapply(issues, getSummaries))
papers <- lapply(summaries, getPapersInfo)

# Stack the per-article tables and save the result.
bind_rows(papers) %>>%
    (export(., "BrazilianReviewFinance.rds"))


# ------------------------------------------------------------
# Economia e Sociedade

# TODO

# Walk the archive: issue pages -> article links -> one table per article.
url <- 'https://periodicos.sbu.unicamp.br/ojs/index.php/ecos/issue/archive'
issues <- getIssues(url)
summaries <- unlist(lapply(issues, getSummaries))
papers <- lapply(summaries, getPapersInfo)

# Stack the per-article tables and save the result.
bind_rows(papers) %>>%
    (export(., "EconomiaSociedade.rds"))


# ------------------------------------------------------------
# Estudos Economicos

# TODO

# Walk the archive: issue pages -> article links -> one table per article.
url <- 'http://www.revistas.usp.br/ee/issue/archive'
issues <- getIssues(url)
summaries <- lapply(issues, getSummaries)
summaries <- unlist(summaries)
papers <- lapply(summaries, getPapersInfo)

# Stack the per-article tables and save the result. The closing parenthesis
# of the export step was missing, which made the script unparsable.
papers %>>%
    bind_rows() %>>%
    (export(., "EstudosEconomicos.rds"))


# ------------------------------------------------------------
# Nova Economia

# TODO

# Walk the archive: issue pages -> article links -> one table per article.
url <- 'https://revistas.face.ufmg.br/index.php/novaeconomia/issue/archive'
issues <- getIssues(url)
summaries <- lapply(issues, getSummaries)
summaries <- unlist(summaries)
papers <- lapply(summaries, getPapersInfo)

# Stack the per-article tables and save the result. The closing parenthesis
# of the export step was missing, which made the script unparsable.
papers %>>%
    bind_rows() %>>%
    (export(., "NovaEconomia.rds"))


# ------------------------------------------------------------
# Pesquisa e Planejamento Econômico

# TODO

# Walk the archive: issue pages -> article links -> one table per article.
url <- 'http://ppe.ipea.gov.br/index.php/ppe/issue/archive'
issues <- getIssues(url)
summaries <- lapply(issues, getSummaries)
summaries <- unlist(summaries)
papers <- lapply(summaries, getPapersInfo)

# Stack the per-article tables and save the result. The closing parenthesis
# of the export step was missing, which made the script unparsable.
papers %>>%
    bind_rows() %>>%
    (export(., "PesquisaPlanejamentoEconomico.rds"))