Aim

Download the files

Description

All the data files and the description file are available on http://base-donnees-publique.medicaments.gouv.fr/telechargement.php.

Data files are .txt

Description file is .pdf

I will use rvest to find the download links on the page and download.file() to download all of them.

Implementation

# Address of the download page that lists every available file.
# NOTE: this variable shares its name with base R's url() function; calls
# written as url(...) still resolve to the function.
url <- "http://base-donnees-publique.medicaments.gouv.fr/telechargement.php"

Get the page

library(rvest)
## Loading required package: xml2
library(XML)
## Loading required package: methods
## 
## Attaching package: 'XML'
## The following object is masked from 'package:rvest':
## 
##     xml
# Fetch the download page and parse it into an HTML document
page <- read_html(url)

The database URLs can be found by looking for links to files ending in .txt.

These URLs are in links inside a list (ul).

# Select every link located in a list under #container, then keep only
# the value of its href attribute (the relative download address).
dl_end_url <- html_attr(
  html_nodes(page, "#container ul a"),
  "href"
)

dl_end_url
##  [1] "docs/Contenu_et_format_des_fichiers_telechargeables_dans_la_BDM_v1.pdf"
##  [2] "?fichier=CIS_bdpm.txt"                                                 
##  [3] "?fichier=CIS_CIP_bdpm.txt"                                             
##  [4] "?fichier=CIS_COMPO_bdpm.txt"                                           
##  [5] "?fichier=CIS_HAS_SMR_bdpm.txt"                                         
##  [6] "?fichier=CIS_HAS_ASMR_bdpm.txt"                                        
##  [7] "?fichier=HAS_LiensPageCT_bdpm.txt"                                     
##  [8] "?fichier=CIS_GENER_bdpm.txt"                                           
##  [9] "?fichier=CIS_CPD_bdpm.txt"                                             
## [10] "?fichier=CIS_InfoImportantes.txt"

We don’t want to download the documentation for the moment

# Keep only the data files by dropping every link that mentions "pdf".
# A logical mask is used instead of negative indexing: with -grep(), a page
# without any PDF link would yield -integer(0), which selects nothing and
# would silently discard all the data links as well.
data_end_url <- dl_end_url[!grepl("pdf", dl_end_url)]

To get the full download URL, just prepend the base URL to each relative link.

# Build the complete download address for a relative link.
#   endurl: character vector of relative links (e.g. "?fichier=CIS_bdpm.txt")
#   base:   prefix placed in front of each link; defaults to `url`
# Returns a character vector with `base` and `endurl` joined without any
# separator.
full_url <- function(endurl, base = url) {
  paste(base, endurl, sep = "")
}

# Derive the local file name from a relative link by removing every
# occurrence of the "?fichier=" query prefix.
#   endurl: character vector of relative links
# Returns a character vector of the same length; links without the prefix
# are returned unchanged.
filename <- function(endurl) {
  query_prefix <- "\\?fichier="
  gsub(pattern = query_prefix, replacement = "", x = endurl)
}

We will download all the raw data in a directory

# Folder that receives the raw downloaded files
raw_data_dir <- "raw_data"

# Create the folder on first run only
if (!dir.exists(raw_data_dir)) {
  dir.create(raw_data_dir)
}

# Download every data file that is not already present locally.
# The check is done file by file rather than by testing whether the
# directory is empty, so a run that was interrupted part-way is completed
# on the next run instead of leaving the missing files undownloaded.
for (end_url in data_end_url) {
  destfile <- file.path(raw_data_dir, filename(end_url))

  if (!file.exists(destfile)) {
    # download.file() returns 0 on success and a non-zero code on failure
    status <- download.file(url = full_url(end_url), destfile = destfile)

    if (status != 0) {
      # Drop any partial file so that the next run retries this download
      unlink(destfile)
      stop("Download failed for ", full_url(end_url), call. = FALSE)
    }
  }
}

List the downloaded files

# Names of the files found in the raw data directory and their sizes
datafiles <- dir(raw_data_dir)
sizes <- file.size(file.path(raw_data_dir, datafiles))

# Render the listing as a table, with sizes divided by 10^6 and rounded to
# one decimal. check.names = FALSE keeps the column headers exactly as
# written (they contain spaces and parentheses); FALSE is spelled out
# because F is an ordinary variable that can be reassigned.
kable(
  data.frame(
    "File name" = datafiles,
    "Size (Mo)" = round(sizes / 10^6, 1),
    check.names = FALSE
  )
)
File name Size (Mo)
CIS_bdpm.txt 2.7
CIS_CIP_bdpm.txt 4.0
CIS_COMPO_bdpm.txt 2.1
CIS_CPD_bdpm.txt 0.7
CIS_GENER_bdpm.txt 1.0
CIS_HAS_ASMR_bdpm.txt 1.6
CIS_HAS_SMR_bdpm.txt 2.4
CIS_InfoImportantes.txt 6.5
HAS_LiensPageCT_bdpm.txt 0.4