MySQL for CSG from within R

$Date: 2005/05/13 14:37:46 $
$Revision: 1.1 $

Getting Started

Before you can access an SQL database in R, you need to load a couple libraries:
> library(RMySQL)	# this will also load DBI 
Loading required package: DBI
and then establish a connection to the database:
>  con <- dbConnect(dbDriver("MySQL"),dbname="MYDBNAME",host="DBHOST",username="DBUID",password="UIDPW")
Notice the use of a password in the command above. A better way to connect that protects your password is described in the next section. The R variable con is a "connection" to the database MYDBNAME.
> con
 

> attributes(con)
$Id
[1] 15295     1

$class
[1] "MySQLConnection"
attr(,"package")
[1] "RMySQL"

> summary(con)
 
  User: DBUID 
  Host: DBHOST 
  Dbname: MYDBNAME 
  Connection type: DBHOST via TCP/IP 
  No resultSet available
Queries sent to the connection will be passed along to the the mysql server and returned to R, usually in the form of a data.frame.

A Better Way to Get Started

Notice that in the example above you need to send a password. Since you probably don't want your password sitting around in script files, there is another way to get authenticated.

Make a file called $HOME/.my.cnf with the following format (see the RMySQL documentation for details):

[MYDBNAME]
user=DBUID
host=DBHOST
database=MYDBNAME
password=UIDPW
Make this file READABLE ONLY BY YOU (use chmod go-wrx). You can multiple sections in this file, one for each combination of user/database/host that you use. The name of the section (enclosed in the square brackets can be anything you like. Once you have done this, you can establish a connection as shown below:
>  con2 <- dbConnect(dbDriver("MySQL"),group="MYDBNAME")

Querying the Database

SQL queries can now be made using the R command dbGetQuery. Responses are stored in a data.frame, which can (and in most cases should) be stored in an R variable. The ending semicolon appears to be optional.
> dbGetQuery(con,"show tables") -> tables
> dim(tables)
[1] 34  1
> tables
   Tables_in_MYDBNAME
1           birthw
2         geograpa
3    gmap_county_l
4    gmap_gparents
5     gmap_parents
6       gmap_probs
7         kf_album
8          kf_cntr
9     kf_discharge
10          kf_f01
11         kf_f01b
12          kf_f02
13          kf_f03
14         kf_f03b
15          kf_f04
16          kf_f05
17          kf_f06
18          kf_f07
19         kf_f07b
20          kf_f08
21        kf_f08lu
22         kf_f08x
23          kf_f09
24        kf_f09lu
25         kf_f09x
26         kf_fams
27        kf_lipid
28        kf_notes
29          kf_ped
30           kf_rx
31          ladata
32       phenovars
33         subsets
34         summary
The previous information can also be obtained as a vector using the function dbListTables.
> dbListTables(con)
 [1] "birthw"        "geograpa"      "gmap_county_l" "gmap_gparents"
 [5] "gmap_parents"  "gmap_probs"    "kf_album"      "kf_cntr"      
 [9] "kf_discharge"  "kf_f01"        "kf_f01b"       "kf_f02"       
[13] "kf_f03"        "kf_f03b"       "kf_f04"        "kf_f05"       
[17] "kf_f06"        "kf_f07"        "kf_f07b"       "kf_f08"       
[21] "kf_f08lu"      "kf_f08x"       "kf_f09"        "kf_f09lu"     
[25] "kf_f09x"       "kf_fams"       "kf_lipid"      "kf_notes"     
[29] "kf_ped"        "kf_rx"         "ladata"        "phenovars"    
[33] "subsets"       "summary"    
There are a number of other utility functions of this sort that will likely be useful for writing scripts and for dealing with queries that return a large amount of data, but the remainder of this document will focus on dbGetQuery.

Sample Session With RMySQL

> dbGetQuery(con,"desc kf_f05") 
          Field        Type Null Key Default Extra
1       studyid  varchar(8)  YES              
2      memberid     char(3)  YES              
3      rstudyid  varchar(8)  YES              
4           sex     char(1)  YES              
5      relation     int(11)  YES              
6       relinfo varchar(60)  YES              
7        street varchar(30)  YES              
8           zip  varchar(5)  YES              
9          city varchar(12)  YES              
10          pop     char(3)  YES              
11          mom     char(3)  YES              
12     deceased     char(1)  YES              
13     deathage     char(3)  YES              
14   deathcause varchar(30)  YES              
15    hrtattack     char(1)  YES              
16     hrtakage     char(3)  YES              
17    cerstroke     char(1)  YES              
18   cerstrkage     char(3)  YES              
19   hypertensn     char(1)  YES              
20   hyprtnsage     char(3)  YES              
21     diabetes     char(1)  YES              
22   diabetsage     char(3)  YES              
23      insulin     char(1)  YES              
24        gstdm     char(1)  YES              
25     gstdmage     char(3)  YES              
26        other     char(1)  YES              
27     diseases varchar(50)  YES              
28       partic     char(1)  YES              
29     savedate        date  YES              
30 activerecord     int(11)  YES              
31   activedate  varchar(8)  YES              
32   activetime  varchar(8)  YES              
33   activeuser  varchar(8)  YES              

> dbGetQuery(con,"select studyid,sex,deceased,deathage from kf_f05") -> df 

> df[1:8,]
   studyid sex deceased deathage
1 0305-100   M        1      078
2 0305-100   F        1      078
3 0305-100   F        2     
4 0305-100   F        2     
5 0305-100   F        2     
6 0305-100   F        2     
7 0305-100   M        2     
8 0305-100   M        2     

> is.numeric(df$deathage)
[1] FALSE

> table(df$sex)

   .    F    M 
   1 7766 7804 
> table(df$deceased)

   1    2    3 
5647 8741   22 

> table(as.numeric(df$deathage))

  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19 
209  46  42  33  16  18  16  20  12   4   8   1   8   7  10   7   7  12  17  20 
 20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35  36  37  38  39 
 31  42  29  22  26   7  13  18  12  22  29  13  12  13  14  30  24  10  27  30 
 40  41  42  43  44  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59 
 44  13  51  22  35  35  29  30  43  53  70  41  58  56  71  67  61  58 100  75 
 60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77  78  79 
101  81 130  84 101 126  89 130  91 115 147  86 148 117  97 144 104  93  89  96 
 80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98 101 
134  56 100  76  64  54  56  42  48  50  16   9  19  13   5   1   2   1   1   1 
102 104 106 
  3   2   1 

> library(lattice)
> histogram(~as.numeric(deathage),df)
> library(lattice)
> png("deathage.png")
> trellis.par.set(theme=col.whitebg())
> histogram(~as.numeric(deathage),df)
> dev.off()
Here is the histogram produced above.
histogram of death age

Scripting and Large Queries

There appear to be cleverer ways to do some of this than using dbGetQuery. Those planning on writing scripts and/or selecting large amounts of data from a database should probably read the RMySQL documentation first. Perhaps at some later date these notes will contain information about this.