Scraping Wikipedia with R to make a list and a data frame

Question 1

One possibility is to use regular expressions. I have never done that in R, but the stringr package seems to be the recommended tool; see the question "Extract a regular expression match in R version 2.10" and the stringr reference manual ( http://cran.r-project.org/web/packages/stringr/stringr.pdf ).

EDIT: Code that appears to work for me

library(XML)
library(RCurl)
options(RCurlOptions = list(cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl")))
library(stringr)

# Download the page and split the returned text into individual lines.
path <- "https://fr.wikipedia.org/wiki/Jeux_olympiques_d%27hiver_de_2010"
webpage <- getURL(path)
tc <- textConnection(webpage)
webpage <- readLines(tc)
close(tc)

# Parse the HTML into an internal document. The no-op error handler keeps
# parser complaints (e.g. about malformed markup) from being reported.
pagetree <- htmlTreeParse(
  webpage,
  error = function(...) {},
  useInternalNodes = TRUE,
  encoding = "UTF-8"
)

# Text content of every <tr> that is a direct child of a <table>.
tablehead <- xpathSApply(pagetree, "//*/table/tr", xmlValue)

# The row used below is selected by position, so it depends on the page
# layout at the time of writing. Fail loudly if the page has fewer rows
# instead of silently carrying an NA through the rest of the script.
# NOTE(review): if the page layout changes, this index must be re-checked.
country_row <- 31L
if (length(tablehead) < country_row) {
  stop(
    "Expected at least ", country_row, " table rows but found ",
    length(tablehead), "; the page layout has probably changed.",
    call. = FALSE
  )
}
country <- tablehead[country_row]

# One list element holding one string per line of the selected row.
country <- strsplit(country, "\n")

# Extract the country: everything before the first "(", trimmed.
bar <- function(x) str_trim(str_extract(x, "[^(]*"), side = "both")
# vapply() guarantees a character vector whatever the input looks like.
res1 <- vapply(country[[1]], bar, character(1))

# Extract the number of athletes: the text inside the first "(...)",
# trimmed. Lines without parentheses yield NA.
foo <- function(x) {
  str_trim(str_match(x, "\\((.*?)\\)")[[2]], side = "both")
}
res2 <- vapply(country[[1]], foo, character(1))

# Build the data frame; non-numeric counts become NA (with a warning).
res2 <- as.numeric(res2)
df <- data.frame(res1, res2)
# Drop the lines for which no country text was extracted.
df <- df[res1 != "", ]

# Inspect the result.
nrow(df)
summary(df)

Question 2

Try the following, which uses plyr to build the data frame one line at a time:

library(plyr)

# Split the raw row text into one string per line.
country <- str_split(country, "\n")[[1]]

# Build one data-frame row per line and bind them together:
#   a = first run of ASCII letters on the line
#   b = first run of digits on the line
# Iterate over the whole vector; indexing it with [[1]] here would process
# only the first line.
# Note: "[A-Za-z]+" stops at the first space or non-ASCII letter, so names
# containing accents or several words are truncated. A pattern such as
# "[^(]+" (followed by trimming) keeps the full text before the "(".
df <- ldply(country, function(z) {
  data.frame(
    a = str_extract(z, "[A-Za-z]+"),
    b = str_extract(z, "[0-9]+")
  )
})

# Lines without both a name and a number contain NA and are dropped.
head(na.omit(df))

                                  a                        b
2                           Afrique                        2
3                           Albanie                        1
4                               Alg                        1
5                         Allemagne                      153
6                           Andorre                        6
7                         Argentine                        7