Connect WRDS in R

Connect from R to Wharton Research Data Services

to set up connection from R to WRDS (here)

library(RPostgres)
library(tidyverse)


# I've set up wrds connection before hand. 
# Please use your username and password here.
wrds <- dbConnect(
  Postgres(),
  host = 'wrds-pgdata.wharton.upenn.edu',
  port = 9737,
  dbname = 'wrds',
  sslmode = 'require',
  user = Sys.getenv("wrds_user"),
  pass = Sys.getenv("wrds_pass")
)
knitr::opts_chunk$set(message = FALSE)

Example

Check variables (column headers) in COMP ANNUAL FUNDAMENTAL

res <- dbSendQuery(
  wrds,
  "select column_name
                   from information_schema.columns
                   where table_schema='compa'
                   and table_name='funda'
                   order by column_name"
)
# data <- dbFetch(res, n=-1) 
data <- dbFetch(res, n = 10)
dbClearResult(res) # closes the connection
head(data)
##   column_name
## 1       acchg
## 2        acco
## 3       accrt
## 4     acctchg
## 5     acctstd
## 6        acdo

Overview

All data libraries available at WRDS

res <- dbSendQuery(wrds, "select distinct table_schema
                   from information_schema.tables
                   order by table_schema")
all_lib <- dbFetch(res, n=-1)
dbClearResult(res)
head(all_lib)
##           table_schema
## 1                  aha
## 2     aha_hcris_3years
## 3 aha_hcris_3years_old
## 4     aha_hcris_recent
## 5 aha_hcris_recent_old
## 6 aha_it_survey_3years

datasets within a given library

res <- dbSendQuery(wrds, "select distinct table_name
                   from information_schema.columns
                   where table_schema='comp_na_daily_all'
                   order by table_name")
df_in_lib <- dbFetch(res, n=-1)
dbClearResult(res)
head(df_in_lib)
##      table_name
## 1      aco_amda
## 2      aco_imda
## 3   aco_indfnta
## 4   aco_indfntq
## 5 aco_indfntytd
## 6    aco_indsta

variables within a given dataset

res <- dbSendQuery(wrds, "select column_name
                   from information_schema.columns
                   where table_schema='comp_na_daily_all'
                   and table_name='funda'
                   order by column_name")
variables <- dbFetch(res, n=-1)
dbClearResult(res)
head(variables)
##   column_name
## 1       acchg
## 2        acco
## 3       accrt
## 4     acctchg
## 5     acctstd
## 6        acdo

Specific Example

Get advertising and R&D data

# from comp_na_daily_all.funda
res <- dbSendQuery(wrds, "select gvkey, fyear, xad, xrd
                   from comp_na_daily_all.funda
                   where fyear between 2000 and 2010 
                     and xad is not null 
                     and xrd is not null")

# res <- dbSendQuery(wrds, "select gvkey, fyear
#                    from comp_na_daily_all.funda
#                    where fyear between 2000 and 2020")

data <- dbFetch(res, n=10) |> 
# data <- dbFetch(res, n=-1) |> 
    unique()
dbClearResult(res)
head(data)
##    gvkey fyear     xad     xrd
## 1 001050  2000   0.188   0.140
## 2 001084  2000   1.711   0.036
## 3 001104  2000   3.170   0.046
## 4 001111  2000  16.500  41.396
## 5 001117  2000   0.161   1.175
## 6 001161  2000 148.000 641.799

Get industry data

In WRDS, the industry classifications visible on the web version differ from those in the code version. This discrepancy arises because the web dataset undergoes (behind the scene) a merging process where different variables are combined to produce the final table. Consequently, in the library dataset comp_na_daily_all.funda, key classifications such as gind, gsubind, naics, and sic are absent. Instead, it includes naics historical and sic historical, which often contain incomplete data. To obtain a complete view, users must merge this dataset with the industry dataset.

# get industry data
res <- dbSendQuery(wrds,
                   "select gvkey, gind, gsubind, naics, sic
                   from comp_na_daily_all.names")
ind <- dbFetch(res, n = -1)
dbClearResult(res)
head(ind)
##    gvkey   gind  gsubind  naics  sic
## 1 001000   <NA>     <NA>   <NA> 3089
## 2 001001 253010 25301040    722 5812
## 3 001002   <NA>     <NA>   <NA> 3825
## 4 001003 255040 25504040 442110 5712
## 5 001004 201010 20101010 423860 5080
## 6 001005   <NA>     <NA>   <NA> 3724
df <- data |> 
    left_join(ind)
head(df)
##    gvkey fyear     xad     xrd   gind  gsubind  naics  sic
## 1 001050  2000   0.188   0.140 202010 20201050 333413 3564
## 2 001084  2000   1.711   0.036 502030 50203010 519290 7370
## 3 001104  2000   3.170   0.046 202010 20201060 332215 3420
## 4 001111  2000  16.500  41.396 451030 45103030 511210 7372
## 5 001117  2000   0.161   1.175 452010 45201020 334220 3663
## 6 001161  2000 148.000 641.799 453010 45301020 334413 3674
Mike Nguyen, PhD
Mike Nguyen, PhD
Visiting Professor of Data Sciences and Operations

My research interests include marketing, and social science.

Next
Previous

Related