Preparing Manifesto Project data using quanteda

# Wrap printed console output at 120 characters instead of the default width.
options(width = 120)

The CSV files in the folder “ManifestoProject” were downloaded from the Manifesto Project website. Redistribution of the data is prohibited, so readers who want to reproduce the following will need to download their own copy of the data set and upload it to the virtual machine that runs this notebook. To do this:

  1. Pull down the “File” menu item and select “Open”.
  2. An overview of the folder that contains the notebook opens.
  3. The folder view has a button labelled “Upload”. Use this to upload the file that you downloaded from the Manifesto Project website.

Note that the uploaded data will disappear once you “Quit” the notebook (and the Jupyter instance).

# List the Manifesto Project CSV files, including the folder in each path.
# `pattern` is a regular expression, not a shell glob: "*.csv" would also
# match names that merely contain "csv" somewhere, so anchor on a literal
# ".csv" extension at the end of the file name.
csv.files <- dir("ManifestoProject",
                 full.names = TRUE,
                 pattern = "\\.csv$")
# Number of files found (one per manifesto).
length(csv.files)
[1] 14

The package readtext (a companion package for quanteda) is somewhat better able to deal with the Manifesto Project CSV files than tm. It is available from CRAN. You may need to install it using the code install.packages("readtext") if you want to run this on your computer. (The package is already installed on the notebook container, however.)

library(readtext)
# Read every CSV file matched by the glob. The first column of each file is
# used as the text; the docvars `party` and `date` are taken from the file
# names.
UKLib.rt <- readtext(
    file = "ManifestoProject/*.csv",
    text_field = 1,
    docvarsfrom = "filenames",
    docvarnames = c("party", "date")
)
# Number of text rows read across all files.
nrow(UKLib.rt)
[1] 4228

Here we combine the text rows of each file into a single document per party and election date, and then create a document identifier for each of them:

# Collapse all text rows that share the same party and date into one string,
# with the pieces separated by single spaces.
UKLib.rta <- aggregate(
    text ~ party + date,
    data = UKLib.rt,
    FUN = function(pieces) paste(pieces, collapse = " ")
)
# One row per party/date combination remains.
nrow(UKLib.rta)
[1] 14
# Build a document identifier of the form "<party>_<date>" for every row.
UKLib.rta$doc_id <- with(UKLib.rta, paste(party, date, sep = "_"))

The following makes use of the quanteda package. You may need to install it from CRAN using the code install.packages("quanteda") if you want to run this on your computer. (The package is already installed on the notebook container, however.)

library(quanteda)
Package version: 3.2.1
Unicode version: 13.0
ICU version: 67.1
Parallel computing: 20 of 20 threads used.
See https://quanteda.io for tutorials and examples.
# Turn the aggregated data frame into a quanteda corpus. The `doc_id` and
# `text` columns (the defaults of the data-frame method) supply document names
# and texts; the remaining columns become docvars.
UKLib.corpus <- corpus(UKLib.rta,
                       docid_field = "doc_id",
                       text_field = "text")
UKLib.corpus
Corpus consisting of 14 documents and 2 docvars.
51420_196410 :
"""THINK FOR YOURSELF""  The Liberal Party offers the elector..."

51420_196603 :
"For All the People: the Liberal Plan of 1966  BRITAIN DEMAND..."

51420_197006 :
"What a Life!  There must surely be a better way to run a cou..."

51420_197402 :
"'Change the face of Britain'  THE CRISIS OF GOVERNMENT  This..."

51420_197410 :
"Why Britain Needs Liberal Government  A PERSONAL MESSAGE FRO..."

51420_197905 :
"'The Real Fight is for Britain'  INTRODUCTION  With your sup..."

[ reached max_ndoc ... 8 more documents ]

Here we combine metadata with the text documents:

# Load the Manifesto Project document metadata, keeping text columns as
# character vectors rather than factors.
manifesto.metadata <- read.csv(
    file = "documents_MPDataset_MPDS2019b.csv",
    stringsAsFactors = FALSE
)
# Inspect the column names and types.
str(manifesto.metadata)
'data.frame':	4492 obs. of  6 variables:
 $ country    : int  11 11 11 11 11 11 11 11 11 11 ...
 $ countryname: chr  "Sweden" "Sweden" "Sweden" "Sweden" ...
 $ party      : int  11110 11110 11110 11110 11110 11110 11110 11110 11110 11220 ...
 $ partyname  : chr  "Green Ecology Party" "Green Ecology Party" "Green Ecology Party" "Green Ecology Party" ...
 $ date       : int  198809 199109 199409 199809 200209 200609 201009 201409 201809 194409 ...
 $ title      : chr  "Valmanifest" "Valmanifest ‘91" "Valmanifest" "Valmanifest 98" ...
# Attach the Manifesto Project metadata to the documents of the corpus.
#
# merge() sorts its result by the key columns and silently drops rows without
# a match, so assigning its output to docvars() pairs metadata with the right
# document only when the corpus happens to be in that same order. Looking the
# rows up with match() keeps the corpus order by construction.
corpus.vars <- docvars(UKLib.corpus)
meta.row <- match(paste(corpus.vars$party, corpus.vars$date),
                  paste(manifesto.metadata$party, manifesto.metadata$date))
# Fail loudly if a manifesto has no entry in the metadata table.
stopifnot(!anyNA(meta.row))
# Only add the columns that the corpus does not already have (the keys stay).
extra.cols <- setdiff(names(manifesto.metadata), names(corpus.vars))
docvars(UKLib.corpus) <- cbind(corpus.vars,
                               manifesto.metadata[meta.row, extra.cols,
                                                  drop = FALSE],
                               row.names = NULL)
str(docvars(UKLib.corpus))
'data.frame':	14 obs. of  6 variables:
 $ party      : int  51420 51420 51420 51420 51420 51420 51420 51420 51421 51421 ...
 $ date       : int  196410 196603 197006 197402 197410 197905 198306 198706 199204 199705 ...
 $ country    : int  51 51 51 51 51 51 51 51 51 51 ...
 $ countryname: chr  "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
 $ partyname  : chr  "Liberal Party" "Liberal Party" "Liberal Party" "Liberal Party" ...
 $ title      : chr  "Think for Yourself - Vote Liberal" "For all the People: The Liberal Plan of 1966" "What a Life!" "Change the Face of Britain" ...

Finally we create a document-feature matrix of stemmed tokens, without punctuation, numbers, symbols and stopwords:

# Build the document-feature matrix from explicit steps. The one-call form
# dfm(<corpus>, remove_*=, remove=, stem=) is deprecated in quanteda 3 (it
# raises deprecation warnings for dfm.corpus(), `...`, `remove` and `stem`).
#
# 1. Tokenise, dropping punctuation, numbers and symbols.
# 2. Count the tokens per document.
UKLib.dfm <- dfm(tokens(UKLib.corpus,
                        remove_punct = TRUE,
                        remove_numbers = TRUE,
                        remove_symbols = TRUE))
# 3. Drop English stopwords.
UKLib.dfm <- dfm_remove(UKLib.dfm,
                        pattern = stopwords("english"))
# 4. Reduce the remaining features to their word stems.
UKLib.dfm <- dfm_wordstem(UKLib.dfm)
# The docvars of the corpus are carried over to the dfm.
str(docvars(UKLib.dfm))
Warning:
'dfm.corpus()' is deprecated. Use 'tokens()' first.
Warning:
'...' should not be used for tokens() arguments; use 'tokens()' first.
Warning:
'remove' is deprecated; use dfm_remove() instead
Warning:
'stem' is deprecated; use dfm_wordstem() instead
'data.frame':	14 obs. of  6 variables:
 $ party      : int  51420 51420 51420 51420 51420 51420 51420 51420 51421 51421 ...
 $ date       : int  196410 196603 197006 197402 197410 197905 198306 198706 199204 199705 ...
 $ country    : int  51 51 51 51 51 51 51 51 51 51 ...
 $ countryname: chr  "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
 $ partyname  : chr  "Liberal Party" "Liberal Party" "Liberal Party" "Liberal Party" ...
 $ title      : chr  "Think for Yourself - Vote Liberal" "For all the People: The Liberal Plan of 1966" "What a Life!" "Change the Face of Britain" ...

More fine-grained control is possible using tokens():

# Tokenise the corpus, discarding punctuation and numbers.
UKLib.toks <- tokens(
    x = UKLib.corpus,
    remove_punct = TRUE,
    remove_numbers = TRUE
)
UKLib.toks
Tokens consisting of 14 documents and 6 docvars.
51420_196410 :
 [1] "THINK"         "FOR"           "YOURSELF"      "The"           "Liberal"       "Party"         "offers"       
 [8] "the"           "electorate"    "a"             "radical"       "non-Socialist"
[ ... and 8,853 more ]

51420_196603 :
 [1] "For"     "All"     "the"     "People"  "the"     "Liberal" "Plan"    "of"      "BRITAIN" "DEMANDS" "A"      
[12] "NEW"    
[ ... and 31,786 more ]

51420_197006 :
 [1] "What"   "a"      "Life"   "There"  "must"   "surely" "be"     "a"      "better" "way"    "to"     "run"   
[ ... and 23,962 more ]

51420_197402 :
 [1] "Change"     "the"        "face"       "of"         "Britain"    "THE"        "CRISIS"     "OF"        
 [9] "GOVERNMENT" "This"       "country"    "has"       
[ ... and 13,764 more ]

51420_197410 :
 [1] "Why"        "Britain"    "Needs"      "Liberal"    "Government" "A"          "PERSONAL"   "MESSAGE"   
 [9] "FROM"       "THE"        "RT"         "HON"       
[ ... and 10,485 more ]

51420_197905 :
 [1] "The"          "Real"         "Fight"        "is"           "for"          "Britain"      "INTRODUCTION"
 [8] "With"         "your"         "support"      "this"         "election"    
[ ... and 11,438 more ]

[ reached max_ndoc ... 8 more documents ]
# Step 1: count the tokens per document.
UKLib.dfm <- dfm(x = UKLib.toks)
UKLib.dfm

# Step 2: remove the features that are English stopwords.
UKLib.dfm <- dfm_remove(
    UKLib.dfm,
    pattern = stopwords("english")
)
UKLib.dfm

# Step 3: replace the features by their English word stems.
UKLib.dfm <- dfm_wordstem(
    UKLib.dfm,
    language = "english"
)
UKLib.dfm

quanteda provides support for dictionaries:

# A two-category dictionary: each key names a category and lists the words
# that count towards it.
milecondict <- dictionary(
    list(
        Military = c("military", "forces", "war", "defence",
                     "victory", "victorious", "glory"),
        Economy = c("economy", "growth", "business", "enterprise", "market")
    )
)

Here we extract the frequency of tokens belonging to certain dictionaries:

# Count, per document, the tokens that fall into each dictionary category.
# dfm(<corpus>, dictionary = ) is deprecated (quanteda 3 warns about both
# dfm.corpus() and the `dictionary` argument); tokenise first and apply the
# dictionary with dfm_lookup() instead.
UKLib.milecon.dfm <- dfm_lookup(dfm(tokens(UKLib.corpus)),
                                dictionary = milecondict)
UKLib.milecon.dfm
Warning:
'dfm.corpus()' is deprecated. Use 'tokens()' first.
Warning:
'dictionary' and 'thesaurus' are deprecated; use dfm_lookup() instead
# The `date` docvar is an integer of the form YYYYMM: integer division by 100
# gives the year, the remainder gives the month. The day is fixed to 1.
time <- ISOdate(
    year = docvars(UKLib.milecon.dfm, "date") %/% 100,
    month = docvars(UKLib.milecon.dfm, "date") %% 100,
    day = 1
)
time
 [1] "1964-10-01 12:00:00 GMT" "1966-03-01 12:00:00 GMT" "1970-06-01 12:00:00 GMT" "1974-02-01 12:00:00 GMT"
 [5] "1974-10-01 12:00:00 GMT" "1979-05-01 12:00:00 GMT" "1983-06-01 12:00:00 GMT" "1987-06-01 12:00:00 GMT"
 [9] "1992-04-01 12:00:00 GMT" "1997-05-01 12:00:00 GMT" "2001-06-01 12:00:00 GMT" "2005-05-01 12:00:00 GMT"
[13] "2015-05-01 12:00:00 GMT" "2017-06-01 12:00:00 GMT"
# Total number of tokens in each manifesto; used as the denominator below.
UKLib.ntok <- ntoken(UKLib.corpus)

# Military category: raw count per document and share of all tokens.
milit.freq <- as.vector(UKLib.milecon.dfm[, "Military"])
milit.prop <- milit.freq / UKLib.ntok

# Economy category: raw count per document and share of all tokens.
econ.freq <- as.vector(UKLib.milecon.dfm[, "Economy"])
econ.prop <- econ.freq / UKLib.ntok

We plot the relative frequency of the dictionary tokens over time:

# Two panels stacked vertically; par() returns the previous settings, which
# are restored at the end.
op <- par(mfrow = c(2, 1), mar = c(3, 4, 0, 0))

# lowess() returns its x values in increasing order together with the
# smoothed y values. Drawing that x/y pair directly keeps the curve correct
# even when `time` is not sorted, whereas pairing the smoothed y values with
# the original `time` vector is only right for already-sorted input.
plot(time, milit.prop, type = "p", ylab = "Military")
lines(lowess(time, milit.prop))

plot(time, econ.prop, type = "p", ylab = "Economy")
lines(lowess(time, econ.prop))

par(op)