Connect WRDS in R
Connect from R to Wharton Research Data Services
to set up connection from R to WRDS (here)
library(RPostgres)
library(tidyverse)
# I've set up wrds connection before hand.
# Please use your username and password here.
wrds <- dbConnect(
Postgres(),
host = 'wrds-pgdata.wharton.upenn.edu',
port = 9737,
dbname = 'wrds',
sslmode = 'require',
user = Sys.getenv("wrds_user"),
pass = Sys.getenv("wrds_pass")
)
knitr::opts_chunk$set(message = FALSE)
Example
Check variables (column headers) in COMP ANNUAL FUNDAMENTAL
res <- dbSendQuery(
wrds,
"select column_name
from information_schema.columns
where table_schema='compa'
and table_name='funda'
order by column_name"
)
# data <- dbFetch(res, n=-1)
data <- dbFetch(res, n = 10)
dbClearResult(res) # closes the connection
head(data)
## column_name
## 1 acchg
## 2 acco
## 3 accrt
## 4 acctchg
## 5 acctstd
## 6 acdo
Overview
All data libraries available at WRDS
res <- dbSendQuery(wrds, "select distinct table_schema
from information_schema.tables
order by table_schema")
all_lib <- dbFetch(res, n=-1)
dbClearResult(res)
head(all_lib)
## table_schema
## 1 aha
## 2 aha_hcris_3years
## 3 aha_hcris_3years_old
## 4 aha_hcris_recent
## 5 aha_hcris_recent_old
## 6 aha_it_survey_3years
datasets within a given library
res <- dbSendQuery(wrds, "select distinct table_name
from information_schema.columns
where table_schema='comp_na_daily_all'
order by table_name")
df_in_lib <- dbFetch(res, n=-1)
dbClearResult(res)
head(df_in_lib)
## table_name
## 1 aco_amda
## 2 aco_imda
## 3 aco_indfnta
## 4 aco_indfntq
## 5 aco_indfntytd
## 6 aco_indsta
variables within a given dataset
res <- dbSendQuery(wrds, "select column_name
from information_schema.columns
where table_schema='comp_na_daily_all'
and table_name='funda'
order by column_name")
variables <- dbFetch(res, n=-1)
dbClearResult(res)
head(variables)
## column_name
## 1 acchg
## 2 acco
## 3 accrt
## 4 acctchg
## 5 acctstd
## 6 acdo
Specific Example
Get advertising and R&D data
# from comp_na_daily_all.funda
res <- dbSendQuery(wrds, "select gvkey, fyear, xad, xrd
from comp_na_daily_all.funda
where fyear between 2000 and 2010
and xad is not null
and xrd is not null")
# res <- dbSendQuery(wrds, "select gvkey, fyear
# from comp_na_daily_all.funda
# where fyear between 2000 and 2020")
data <- dbFetch(res, n=10) |>
# data <- dbFetch(res, n=-1) |>
unique()
dbClearResult(res)
head(data)
## gvkey fyear xad xrd
## 1 001050 2000 0.188 0.140
## 2 001084 2000 1.711 0.036
## 3 001104 2000 3.170 0.046
## 4 001111 2000 16.500 41.396
## 5 001117 2000 0.161 1.175
## 6 001161 2000 148.000 641.799
Get industry data
In WRDS, the industry classifications visible on the web version differ from those in the code version. This discrepancy arises because the web dataset undergoes (behind the scene) a merging process where different variables are combined to produce the final table. Consequently, in the library dataset comp_na_daily_all.funda
, key classifications such as gind
, gsubind
, naics
, and sic
are absent. Instead, it includes naics historical and sic historical, which often contain incomplete data. To obtain a complete view, users must merge this dataset with the industry dataset.
# get industry data
res <- dbSendQuery(wrds,
"select gvkey, gind, gsubind, naics, sic
from comp_na_daily_all.names")
ind <- dbFetch(res, n = -1)
dbClearResult(res)
head(ind)
## gvkey gind gsubind naics sic
## 1 001000 <NA> <NA> <NA> 3089
## 2 001001 253010 25301040 722 5812
## 3 001002 <NA> <NA> <NA> 3825
## 4 001003 255040 25504040 442110 5712
## 5 001004 201010 20101010 423860 5080
## 6 001005 <NA> <NA> <NA> 3724
df <- data |>
left_join(ind)
head(df)
## gvkey fyear xad xrd gind gsubind naics sic
## 1 001050 2000 0.188 0.140 202010 20201050 333413 3564
## 2 001084 2000 1.711 0.036 502030 50203010 519290 7370
## 3 001104 2000 3.170 0.046 202010 20201060 332215 3420
## 4 001111 2000 16.500 41.396 451030 45103030 511210 7372
## 5 001117 2000 0.161 1.175 452010 45201020 334220 3663
## 6 001161 2000 148.000 641.799 453010 45301020 334413 3674