Baixar lista de patentes a serem raspadas no Google Patents:
TI=(biogas) OR AB=(biogas) OR CL=(biogas) country:BR
e salvá-las como: gp-search-20230830-124212.csv
Função de raspagem: sniff_google_patents
code/raspagem-google-patents/utils.R
#' Scrape selected sections of one Google Patents page.
#'
#' The page is downloaded and parsed once; every section is then looked up in
#' that single parsed document.
#'
#' @param url Address of the Google Patents page (anything accepted by
#'   `rvest::read_html()`).
#' @param show_progress If `TRUE`, print a timestamp and the URL before
#'   scraping.
#' @return A named list with elements `url`, `abstract`, `claims`, `cited_by`,
#'   `patent_citations` and `ipc`. `abstract`, `claims`, `cited_by` and
#'   `patent_citations` are `NA` when the matching node is not found in the
#'   page; `ipc` is `NA_character_` in that case.
sniff_google_patents <- function(url, show_progress = FALSE) {
  if (isTRUE(show_progress)) {
    print(paste(format(Sys.time(), '%Y-%m-%d %H:%M:%S'), url, sep = ' '))
  }

  ## Firefox: Inspect Browser with Disabled JavaScript
  # - Enter about:config into the search bar and select Accept the Risk and Continue.
  # - Enter javascript.enabled into the search box at the top of the page.
  # - Select the javascript.enabled toggle to change the value to false.

  # Fetch and parse the page a single time instead of once per section.
  page <- rvest::read_html(url)

  # Text of the first node matching `xpath`, or NA when nothing matches.
  node_text <- function(xpath) {
    node <- rvest::html_element(page, xpath = xpath)
    if (inherits(node, 'xml_node')) rvest::html_text2(node) else NA
  }

  # Table that follows the element whose text contains `heading`, or NA when
  # no such table header exists. Column names come from the <thead>, rows from
  # the <tbody>; only the first line of `publication_number` is kept.
  citation_table <- function(heading) {
    table_xpath <- paste0(
      '//*[contains (text(), "', heading, '")]//following::table'
    )
    head_node <- rvest::html_element(
      page,
      xpath = paste0(table_xpath, '/thead')
    )
    if (!inherits(head_node, 'xml_node')) {
      return(NA)
    }
    column_names <- head_node |>
      rvest::html_table() |>
      janitor::clean_names() |>
      names()
    rvest::html_element(page, xpath = paste0(table_xpath, '/tbody')) |>
      rvest::html_table() |>
      setNames(column_names) |>
      dplyr::mutate(
        publication_number = gsub('\n.*$', '', publication_number)
      )
  }

  # -----
  # abstract
  abstract <- node_text('//abstract')

  # -----
  # claims
  claims <- node_text('//claims')

  # -----
  # Heading is "Cited By" without JavaScript and
  # "Families Citing this family" with JavaScript.
  cited_by <- citation_table('Families Citing this family')

  # -----
  # Heading is "Family Cites Families" without JavaScript and
  # "Patent Citations" with JavaScript.
  patent_citations <- citation_table('Family Cites Families')

  # -----
  # Classifications (without JavaScript): collect every IPC-like code
  # (e.g. "C02F11/04") found in the markup of the first element that follows
  # the "Classifications" heading.
  ipc_text <- rvest::html_element(
    page,
    xpath = '//*[contains (text(), "Classifications")]//following::*'
  ) |>
    as.character()
  # A missing node is converted to NA (never NULL), so test for NA here.
  ipc <- if (is.na(ipc_text)) {
    NA_character_
  } else {
    stringr::str_extract_all(
      ipc_text,
      '[A-Z]{1}[0-9]{2}[A-Z]{1}[0-9]{2}\\/[0-9]{2}'
    )[[1]]
  }

  # -----
  list(
    url = url,
    abstract = abstract,
    claims = claims,
    cited_by = cited_by,
    patent_citations = patent_citations,
    ipc = ipc
  )
}
Raspagem
Código
library(tidyverse)
library(stringr)
library(purrr)
library(furrr)
library(tictoc)
# Load the scraping function `sniff_google_patents`.
source('code/raspagem-google-patents/utils.R')

# Import the list of patents to be scraped (the first line of the CSV is
# skipped) and normalise the column names.
pat <- janitor::clean_names(
  tibble::tibble(
    readr::read_csv(
      'code/raspagem-google-patents/gp-search-20230830-124212.csv',
      skip = 1
    )
  )
)

# Strip punctuation from the patent ids, then trim surrounding whitespace.
pat <- dplyr::mutate(
  pat,
  id = stringr::str_trim(gsub('[[:punct:]]', '', id))
)

dplyr::glimpse(pat)
Rows: 511
Columns: 10
$ id <chr> "BR112018012788B1", "BR112012028650B1", "BR…
$ title <chr> "METHOD AND FACILITY TO PRODUCE BIOMETHANE …
$ assignee <chr> "Waga Energy", "Solvay Sa", "Herbst Umweltt…
$ inventor_author <chr> "Guénaël PRINCE, Mathieul Lefebvre, Pierre …
$ priority_date <date> 2015-12-24, 2010-05-10, 2017-01-30, 2005-0…
$ filing_creation_date <date> 2016-11-10, 2011-05-10, 2018-01-29, 2006-0…
$ publication_date <date> 2022-12-13, 2019-04-30, 2020-03-17, 2011-0…
$ grant_date <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ result_link <chr> "https://patents.google.com/patent/BR112018…
$ representative_figure_link <chr> "https://patentimages.storage.googleapis.co…
Código
# Run `sniff_google_patents` on every result link, one at a time, and time it.
tictoc::tic()
gp_raspado <- purrr::map(
  .x = pat$result_link,
  .f = sniff_google_patents,
  .progress = TRUE
)
tictoc::toc() # 54 minutes

# Label each scraped result with its patent id.
names(gp_raspado) <- pat$id
54 minutos utilizando 1 núcleo.
Raspagem multiprocessada
Multiprocessando utilizando 28 núcleos.
Código
# Configure multiprocessing: 28 background R sessions.
library(furrr)
plan(multisession, workers = 28)

# Run `sniff_google_patents` on every result link in parallel and time it.
tictoc::tic()
gp_raspado2 <- furrr::future_map(
  pat$result_link,
  sniff_google_patents,
  .progress = TRUE
)
tictoc::toc() # 58 seconds

# Label each scraped result with its patent id. The names must be set on
# `gp_raspado2`, the object created above; the original line targeted
# `gp_resultado`, which is never defined.
names(gp_raspado2) <- pat$id
58 segundos utilizando 28 núcleos.