Add Wiki Gold Corpus #28

Merged
merged 8 commits into from
Aug 17, 2019
1 change: 1 addition & 0 deletions docs/make.jl
@@ -12,6 +12,7 @@ makedocs(modules = [CorpusLoaders],
"StanfordSentimentTreebank" => "StanfordSentimentTreebank.md",
"Twitter" => "Twitter.md",
"WikiCorpus" => "WikiCorpus.md",
"WikiGold" => "WikiGold.md",
"API References" => "APIReference.md"
])

149 changes: 149 additions & 0 deletions docs/src/WikiGold.md
@@ -0,0 +1,149 @@
# WikiGold

WikiGold is a manually annotated corpus for named entity recognition,
made up of a small sample of Wikipedia articles.
The words have been labelled with each of the four CoNLL-03 named entity classes (LOC, MISC, ORG, PER).

The named entity tags follow the BIO1 (IOB) format;
see [inside–outside–beginning tagging](https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)).

The dataset can be loaded using:

    dataset = load(WikiGold())

Each dataset is structured into the levels `document, sentence, word, character`.

```julia
julia> typeof(dataset)
Document{Array{Array{Array{CorpusLoaders.WikiGoldWord,1},1},1},String}
```
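
For illustration, the nested levels can also be indexed directly. This is a sketch, not part of the original docs; it assumes the corpus has already been downloaded via DataDeps:

```julia
julia> doc = dataset[1];   # first document: a vector of sentences

julia> sent = doc[1];      # first sentence: a vector of tagged words

julia> sent[1]             # first tagged word of the corpus
CorpusLoaders.WikiGoldWord("I-MISC", "010")
```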

To extract the desired levels, the `flatten_levels` function from [MultiResolutionIterators.jl](https://github.com/oxinabox/MultiResolutionIterators.jl) can be used.

The following example removes the sentence boundaries,
so that each document contains only a sequence of words.

```julia
julia> using MultiResolutionIterators

julia> no_sent_boundary = flatten_levels(dataset, lvls(WikiGold, :sentence)) |> full_consolidate

julia> typeof(no_sent_boundary)
Document{Array{Array{CorpusLoaders.WikiGoldWord,1},1},String}

julia> no_sent_boundary[1]
142-element Array{CorpusLoaders.WikiGoldWord,1}:
CorpusLoaders.WikiGoldWord("I-MISC", "010")
CorpusLoaders.WikiGoldWord("O", "is")
CorpusLoaders.WikiGoldWord("O", "the")
CorpusLoaders.WikiGoldWord("O", "tenth")
CorpusLoaders.WikiGoldWord("O", "album")
CorpusLoaders.WikiGoldWord("O", "from")
CorpusLoaders.WikiGoldWord("I-MISC", "Japanese")
CorpusLoaders.WikiGoldWord("O", "Punk")
CorpusLoaders.WikiGoldWord("O", "and")
CorpusLoaders.WikiGoldWord("O", "most")
CorpusLoaders.WikiGoldWord("O", "of")
CorpusLoaders.WikiGoldWord("O", "the")
CorpusLoaders.WikiGoldWord("O", "tracks")
CorpusLoaders.WikiGoldWord("O", "were")
CorpusLoaders.WikiGoldWord("O", "re-engineered")
CorpusLoaders.WikiGoldWord("O", ".")
```

Similarly, we can `flatten_levels` at the document level
to get a flat collection of sentences of tagged words.

```julia
julia> my_dataset = flatten_levels(dataset, lvls(WikiGold, :document)) |> full_consolidate

julia> typeof(my_dataset)
Document{Array{Array{CorpusLoaders.WikiGoldWord,1},1},String}

julia> length(my_dataset) # Gives us total number of sentences
1696


julia> my_dataset[1]
15-element Array{CorpusLoaders.WikiGoldWord,1}:
CorpusLoaders.WikiGoldWord("I-MISC", "010")
CorpusLoaders.WikiGoldWord("O", "is")
CorpusLoaders.WikiGoldWord("O", "the")
CorpusLoaders.WikiGoldWord("O", "tenth")
CorpusLoaders.WikiGoldWord("O", "album")
CorpusLoaders.WikiGoldWord("O", "from")
CorpusLoaders.WikiGoldWord("I-MISC", "Japanese")
CorpusLoaders.WikiGoldWord("O", "Punk")
CorpusLoaders.WikiGoldWord("O", "Techno")
CorpusLoaders.WikiGoldWord("O", "band")
CorpusLoaders.WikiGoldWord("I-ORG", "The")
CorpusLoaders.WikiGoldWord("I-ORG", "Mad")
CorpusLoaders.WikiGoldWord("I-ORG", "Capsule")
CorpusLoaders.WikiGoldWord("I-ORG", "Markets")
CorpusLoaders.WikiGoldWord("O", ".")
```

The tag of a tagged word can be accessed at the word level using

    CorpusLoaders.named_entity(tagged_word)

```julia
julia> ner_tag = CorpusLoaders.named_entity.(my_dataset[22])
14-element Array{String,1}:
"O"
"O"
"O"
"O"
"O"
"O"
"O"
"O"
"O"
"O"
"O"
"O"
"I-LOC"
"O"

julia> ws = word.(my_dataset[22])
14-element Array{String,1}:
"By"
"December"
"1864"
","
"they"
"were"
"back"
"in"
"the"
"siege"
"lines"
"of"
"Petersburg"
"."

julia> collect(zip(ws, ner_tag))
14-element Array{Tuple{String,String},1}:
("By", "O")
("December", "O")
("1864", "O")
(",", "O")
("they", "O")
("were", "O")
("back", "O")
("in", "O")
("the", "O")
("siege", "O")
("lines", "O")
("of", "O")
("Petersburg", "I-LOC")
(".", "O")

```
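
As a further sketch (not part of the original docs), per-tag counts over the flattened sentences can be gathered with a plain `Dict`; exact counts depend on the corpus version:

```julia
julia> tag_counts = Dict{String,Int}();

julia> for sent in my_dataset, w in sent
           t = CorpusLoaders.named_entity(w)  # e.g. "O", "I-LOC", "I-ORG"
           tag_counts[t] = get(tag_counts, t, 0) + 1
       end

julia> sum(values(tag_counts))  # total token count of the corpus
```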
4 changes: 3 additions & 1 deletion src/CorpusLoaders.jl
@@ -11,7 +11,7 @@ export Document, TaggedWord, SenseAnnotatedWord, PosTaggedWord, CoNLL2003TaggedW
export title, sensekey, word
export load

export WikiCorpus, SemCor, Senseval3, CoNLL, IMDB, Twitter, StanfordSentimentTreebank
export WikiCorpus, SemCor, Senseval3, CoNLL, IMDB, Twitter, StanfordSentimentTreebank, WikiGold

function __init__()
include(joinpath(@__DIR__, "WikiCorpus_DataDeps.jl"))
@@ -22,6 +22,7 @@ function __init__()
include(joinpath(@__DIR__, "IMDB_DataDeps.jl"))
include(joinpath(@__DIR__, "Twitter_DataDeps.jl"))
include(joinpath(@__DIR__, "StanfordSentimentTreebank_DataDeps.jl"))
include(joinpath(@__DIR__, "WikiGold_DataDeps.jl"))
end

include("types.jl")
@@ -34,5 +35,6 @@ include("CoNLL.jl")
include("IMDB.jl")
include("Twitter.jl")
include("StanfordSentimentTreebank.jl")
include("WikiGold.jl")

end
61 changes: 61 additions & 0 deletions src/WikiGold.jl
@@ -0,0 +1,61 @@
struct WikiGold{S}
    filepath::S
end

WikiGold() = WikiGold(datadep"WikiGold")

MultiResolutionIterators.levelname_map(::Type{WikiGold}) = [
    :doc=>1, :document=>1, :article=>1,
    :sent=>2, :sentence=>2,
    :word=>3, :token=>3,
    :char=>4, :character=>4
]

function parse_WikiGold_tagged_word(line::AbstractString)
    tokens_tags = split(line)
    length(tokens_tags) != 2 && error("Error parsing line: \"$line\". Invalid format.")
    return WikiGoldWord(tokens_tags[2], tokens_tags[1])
end

function parse_WikiGoldfile(filename)
    local sent
    local doc
    docs = @NestedVector(WikiGoldWord,3)()
    context = Document(intern(basename(filename)), docs)

    # structure
    function new_document()
        doc = @NestedVector(WikiGoldWord,2)()
        push!(docs, doc)
    end

    function new_sentence()
        sent = @NestedVector(WikiGoldWord,1)()
        push!(doc, sent)
    end

    # words
    get_tagged(line) = push!(sent, parse_WikiGold_tagged_word(line))

    new_document()
    new_sentence()
    # parse
    for line in eachline(filename)
        if length(line) == 0
            new_sentence()
        elseif startswith(strip(line), "-DOCSTART-")
            length(docs) > 0 && isempty(doc[end]) && deleteat!(doc, lastindex(doc))
            new_document()
        else
            get_tagged(line)
        end
    end
    isempty(vcat(docs[end]...)) && deleteat!(docs, lastindex(docs))

    return context
end

function load(corpus::WikiGold)
    file = readdir(corpus.filepath)[1]
    return parse_WikiGoldfile(joinpath(corpus.filepath, file))
end
19 changes: 19 additions & 0 deletions src/WikiGold_DataDeps.jl
@@ -0,0 +1,19 @@
using DataDeps

register(DataDep("WikiGold",
    """
    Website: https://figshare.com/articles/Learning_multilingual_named_entity_recognition_from_Wikipedia/5462500

    WikiGold is a manually annotated corpus over a small sample of Wikipedia articles in CoNLL format (IOB).
    It contains 145 documents, 1696 sentences and 39152 tokens.
    The words have been labelled with each of the four CoNLL-03 named entity classes (LOC, MISC, ORG, PER).

    Please cite the following publication if you use the corpus:
    Dominic Balasuriya, Nicky Ringland, Joel Nothman, Tara Murphy, James R. Curran, 2009. Named Entity Recognition in Wikipedia, Proceedings of the 2009 Workshop on the People’s Web Meets NLP, ACL-IJCNLP 2009, pages 10–18.
    https://www.aclweb.org/anthology/papers/W/W09/W09-3302/
    """,
    "https://s3-eu-west-1.amazonaws.com/pfigshare-u-files/9446377/wikigold.conll.txt",
    "c797a64d0cf73ed058363f77671486bcfd413e70fda1726ad4a6ba624455225f";
    post_fetch_method = function(fn)
    end
))
9 changes: 9 additions & 0 deletions src/types.jl
@@ -89,12 +89,21 @@ struct NERTaggedWord <: TaggedWord
end
end

struct WikiGoldWord <: TaggedWord
    ner_tag::String
    word::String
    function WikiGoldWord(ner_tag, word)
        new(intern(ner_tag), intern(word))
    end
end

const TaggedSentence = Vector{TaggedWord}

word(tword::TaggedWord) = tword.word
word(str::AbstractString) = str
sensekey(saword::SenseAnnotatedWord) = saword.lemma * "%" * saword.lexsn
named_entity(ner_word::NERTaggedWord) = ner_word.ner_tag
named_entity(ner_word::WikiGoldWord) = ner_word.ner_tag

#######################

3 changes: 2 additions & 1 deletion test/runtests.jl
@@ -8,7 +8,8 @@ files = ["types",
"IMDB",
"Twitter",
"StanfordSentimentTreebank",
"wikicorpus"
"wikicorpus",
"wikigold"
]


2 changes: 1 addition & 1 deletion test/test_CoNLL.jl
@@ -17,7 +17,7 @@ end
train = load(CoNLL())
docs = train[1:5]

words = full_consolidate(flatten_levels(docs, (!lvls)(Senseval3, :word)))
words = full_consolidate(flatten_levels(docs, (!lvls)(CoNLL, :word)))
@test length(words) > length(docs)
@test length(words) > sum(length.(docs))
@test typeof(words) == Vector{CorpusLoaders.NERTaggedWord}
27 changes: 27 additions & 0 deletions test/test_wikigold.jl
@@ -0,0 +1,27 @@
using CorpusLoaders
using Test
using Base.Iterators
using MultiResolutionIterators

@testset "Basic use" begin
    dataset = load(WikiGold())

    @test length(dataset) > 0
    @test minimum(length.(dataset)) > 0
end

@testset "Using flatten_levels" begin
    dataset = load(WikiGold())
    docs = dataset[1:5]

    words = full_consolidate(flatten_levels(docs, (!lvls)(WikiGold, :word)))
    @test length(words) > length(docs)
    @test length(words) > sum(length.(docs))
    @test typeof(words) == Vector{CorpusLoaders.WikiGoldWord}

    plain_words = word.(words)
    @test typeof(plain_words) <: Vector{String}

    ner_tags = CorpusLoaders.named_entity.(words)
    @test typeof(ner_tags) <: Vector{String}
end