JuliaText · aviks · Jun 24, 2019 · May 28, 2019 · May 28, 2019 · May 29, 2019
diff --git a/docs/src/features.md b/docs/src/features.md
@@ -226,6 +226,32 @@ julia> summarize(s, ns=2)
  "This has too foo sentences."
 ```
 
+## Tagging_schemes
+
+There are many tagging schemes used for sequence labelling.
+TextAnalysis currently offers functions for conversion between these tagging formats.
+
+*   BIO1
+*   BIO2
+*   BIOES
+
+```julia
+julia> tags = ["I-LOC", "O", "I-PER", "B-MISC", "I-MISC", "B-PER", "I-PER", "I-PER"]
+
+julia> tag_scheme!(tags, "BIO1", "BIOES")
+
+julia> tags
+8-element Array{String,1}:
+ "S-LOC"
+ "O"
+ "S-PER"
+ "B-MISC"
+ "E-MISC"
+ "B-PER"
+ "I-PER"
+ "E-PER"
+```
+
 ## Parts of Speech Tagger
 
 This tagger can be used to find the POS tag of a word or token in a given sentence. It is a based on `Average Perceptron Algorithm`.

diff --git a/src/TextAnalysis.jl b/src/TextAnalysis.jl
@@ -50,6 +50,7 @@ module TextAnalysis
     export strip_numbers, strip_non_letters, strip_indefinite_articles, strip_definite_articles, strip_articles
     export strip_prepositions, strip_pronouns, strip_stopwords, strip_sparse_terms, strip_frequent_terms, strip_html_tags
     export SentimentAnalyzer
+    export tag_scheme!
     export jackknife_avg, listify_ngrams, weighted_lcs, fmeasure_lcs
     export rouge_l_summary, rouge_l_sentence, rouge_n
     export PerceptronTagger, fit!, predict
@@ -78,6 +79,7 @@ module TextAnalysis
     include("sentiment.jl")
     include("bayes.jl")
     include("deprecations.jl")
+    include("tagging_schemes.jl")
     include("utils.jl")
     include("rouge.jl")
     include("averagePerceptronTagger.jl")

diff --git a/src/tagging_schemes.jl b/src/tagging_schemes.jl
@@ -0,0 +1,146 @@
+# Ref:
+# https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)
+# https://chameleonmetadata.com/Education/NLP-3/ref_nlp_encoding_schemes_list.php
+
+# Abstract supertype of the tagging-scheme marker types declared below; the
+# `tag_scheme!` conversion methods dispatch on its concrete subtypes.
+abstract type tag_scheme end
+
+# Field-less marker types, one per supported tagging scheme.
+struct BIO1 <: tag_scheme end # BIO
+struct BIO2 <: tag_scheme end
+struct BIOES <: tag_scheme end
+
+# Scheme names accepted by the string-based `tag_scheme!` method. Each entry
+# is spelled exactly like the name of one of the marker types above.
+const available_schemes = ["BIO1", "BIO2", "BIOES"]
+
+"""
+    tag_scheme!(tags, current_scheme::String, new_scheme::String)
+
+Convert `tags` in place from `current_scheme` to `new_scheme`.
+
+Scheme names are matched case-insensitively. The tagging schemes currently
+supported are:
+ * BIO1 (BIO)
+ * BIO2
+ * BIOES
+
+Nothing is done when `tags` is empty or when both names are equal (ignoring
+case). Otherwise an error is raised if either name is not a supported scheme,
+or if a tag that is not valid for `current_scheme` is met during conversion.
+
+# Example
+```julia-repl
+julia> tags = ["I-LOC", "O", "I-PER", "B-MISC", "I-MISC", "B-PER", "I-PER", "I-PER"]
+
+julia> tag_scheme!(tags, "BIO1", "BIOES")
+
+julia> tags
+8-element Array{String,1}:
+ "S-LOC"
+ "O"
+ "S-PER"
+ "B-MISC"
+ "E-MISC"
+ "B-PER"
+ "I-PER"
+ "E-PER"
+```
+"""
+function tag_scheme!(tags, current_scheme::String, new_scheme::String)
+    # Keep the normalised names in their own variables so the arguments are
+    # never rebound to a value of a different type.
+    current_name = uppercase(current_scheme)
+    new_name = uppercase(new_scheme)
+    (isempty(tags) || current_name == new_name) && return
+
+    if new_name ∉ available_schemes || current_name ∉ available_schemes
+        error("Invalid tagging scheme")
+    end
+
+    # Resolve the names through an explicit lookup instead of calling `eval`
+    # at run time; the set of schemes is closed and known in advance.
+    return tag_scheme!(tags, _tag_scheme_instance(current_name),
+                       _tag_scheme_instance(new_name))
+end
+
+# Return the marker instance for an upper-case scheme name; raises the same
+# error as the public entry point for a name that has no marker type.
+function _tag_scheme_instance(name::String)
+    name == "BIO1" && return BIO1()
+    name == "BIO2" && return BIO2()
+    name == "BIOES" && return BIOES()
+    error("Invalid tagging scheme")
+end
+
+# Convert `tags` in place from BIO1 to BIO2.
+#
+# Any tag whose first character is 'O' is normalised to "O". An "I-" tag is
+# rewritten with a "B" prefix when it is the first tag, follows an "O", or
+# follows a tag with a different suffix (everything after the first
+# character). "B-" tags are left as they are. An empty tag, or a tag with any
+# other first character, raises `error("Invalid tags")`.
+function tag_scheme!(tags, current_scheme::BIO1, new_scheme::BIO2)
+    for i in eachindex(tags)
+        tag = tags[i]
+        isempty(tag) && error("Invalid tags")
+        prefix = tag[1]
+
+        if prefix == 'O'
+            tags[i] = "O"
+        elseif prefix == 'I'
+            # tags[i - 1] has already been visited, but only its first
+            # character can have changed, so its suffix is still comparable.
+            if i == firstindex(tags) || tags[i - 1] == "O" ||
+               tags[i - 1][2:end] != tag[2:end]
+                tags[i] = 'B' * tag[2:end]
+            end
+        elseif prefix != 'B'
+            error("Invalid tags")
+        end
+    end
+end
+
+# Convert `tags` in place from BIO2 to BIO1.
+#
+# Any tag whose first character is 'O' is normalised to "O". A "B-" tag is
+# rewritten with an "I" prefix when it is the last tag, is followed by "O", or
+# is followed by a tag with a different suffix (everything after the first
+# character); otherwise it is kept. "I-" tags are left as they are. An empty
+# tag, or a tag with any other first character, raises `error("Invalid tags")`.
+#
+# NOTE(review): the decision to keep a "B-" looks at the *following* tag. The
+# usual description of BIO1 keeps "B-" based on the *preceding* chunk, so this
+# may not be the canonical mapping -- confirm the intent before changing it,
+# since the package's round-trip tests appear to rely on the current rule.
+function tag_scheme!(tags, current_scheme::BIO2, new_scheme::BIO1)
+    for i in eachindex(tags)
+        tag = tags[i]
+        isempty(tag) && error("Invalid tags")
+        prefix = tag[1]
+
+        if prefix == 'O'
+            tags[i] = "O"
+        elseif prefix == 'B'
+            if i == lastindex(tags) || tags[i + 1] == "O" ||
+               tags[i + 1][2:end] != tag[2:end]
+                tags[i] = 'I' * tag[2:end]
+            end
+        elseif prefix != 'I'
+            error("Invalid tags")
+        end
+    end
+end
+
+# Convert `tags` in place from BIO2 to BIOES.
+#
+# Any tag whose first character is 'O' is normalised to "O". A "B-" or "I-"
+# tag closes its entity when it is the last tag or when the next tag is not
+# the "I-" tag with the same suffix; a closing "B-" becomes "S-" and a closing
+# "I-" becomes "E-". Tags that do not close an entity are left unchanged. An
+# empty tag, or a tag with any other first character, raises
+# `error("Invalid tags")`.
+function tag_scheme!(tags, current_scheme::BIO2, new_scheme::BIOES)
+    for i in eachindex(tags)
+        tag = tags[i]
+        isempty(tag) && error("Invalid tags")
+        prefix = tag[1]
+
+        if prefix == 'O'
+            tags[i] = "O"
+            continue
+        end
+        (prefix == 'B' || prefix == 'I') || error("Invalid tags")
+
+        suffix = tag[2:end]
+        # The loop rewrites left to right, so tags[i + 1] is still in BIO2
+        # form. Comparing against the full "I" * suffix (not just the suffix)
+        # also ends the entity when the next tag starts a new entity of the
+        # same type, e.g. ["B-PER", "B-PER"] -> ["S-PER", "S-PER"].
+        closes_entity = i == lastindex(tags) || tags[i + 1] != 'I' * suffix
+        if closes_entity
+            tags[i] = (prefix == 'B' ? 'S' : 'E') * suffix
+        end
+    end
+end
+
+# Convert `tags` in place from BIOES to BIO2.
+#
+# Tags starting with 'O' become "O", "E-" tags get an "I" prefix, "S-" tags
+# get a "B" prefix, and "B-"/"I-" tags are kept. Any other first character
+# raises `error("Invalid tags")`.
+function tag_scheme!(tags, current_scheme::BIOES, new_scheme::BIO2)
+    for idx in eachindex(tags)
+        tag = tags[idx]
+        if tag == 'O' || tag[1] == 'O'
+            tags[idx] = "O"
+        else
+            prefix = tag[1]
+            if prefix == 'E'
+                tags[idx] = 'I' * tag[2:end]
+            elseif prefix == 'S'
+                tags[idx] = 'B' * tag[2:end]
+            elseif !(prefix == 'B' || prefix == 'I')
+                error("Invalid tags")
+            end
+        end
+    end
+end
+
+# Convert `tags` in place from BIO1 to BIOES by going through BIO2.
+function tag_scheme!(tags, current_scheme::BIO1, new_scheme::BIOES)
+    intermediate = BIO2()
+    tag_scheme!(tags, current_scheme, intermediate)
+    return tag_scheme!(tags, intermediate, new_scheme)
+end
+
+# Convert `tags` in place from BIOES to BIO1 by going through BIO2.
+function tag_scheme!(tags, current_scheme::BIOES, new_scheme::BIO1)
+    via = BIO2()
+    tag_scheme!(tags, current_scheme, via)
+    return tag_scheme!(tags, via, new_scheme)
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -22,6 +22,7 @@ include("lda.jl")
 include("summarizer.jl")
 include("sentiment.jl")
 include("bayes.jl")
+include("taggingschemes.jl")
 include("rouge.jl")
 include("averagePerceptronTagger.jl")
 

diff --git a/test/taggingschemes.jl b/test/taggingschemes.jl
@@ -0,0 +1,41 @@
+@testset "Tagging_Schemes" begin
+    # Every case converts forward and checks the result, then converts back
+    # and checks that the starting tags are recovered.
+    @testset "BIO1 and BIO2" begin
+        original = ["I-LOC", "O", "I-PER", "B-MISC", "I-MISC", "I-ORG"]
+        expected = ["B-LOC", "O", "B-PER", "B-MISC", "I-MISC", "B-ORG"]
+        tags = copy(original)
+
+        tag_scheme!(tags, "BIO1", "BIO2")
+        @test tags == expected
+
+        tag_scheme!(tags, "BIO2", "BIO1")
+        @test tags == original
+    end
+
+    @testset "BIO1 and BIOES" begin
+        original = [
+            "I-LOC", "O", "I-PER", "B-MISC", "I-MISC", "B-PER", "I-PER", "I-PER",
+        ]
+        expected = [
+            "S-LOC", "O", "S-PER", "B-MISC", "E-MISC", "B-PER", "I-PER", "E-PER",
+        ]
+        tags = copy(original)
+
+        tag_scheme!(tags, "BIO1", "BIOES")
+        @test tags == expected
+
+        tag_scheme!(tags, "BIOES", "BIO1")
+        @test tags == original
+    end
+
+    @testset "BIO2 and BIOES" begin
+        original = [
+            "B-LOC", "O", "B-PER", "B-MISC", "I-MISC", "B-PER", "I-PER", "I-PER",
+        ]
+        expected = [
+            "S-LOC", "O", "S-PER", "B-MISC", "E-MISC", "B-PER", "I-PER", "E-PER",
+        ]
+        tags = copy(original)
+
+        tag_scheme!(tags, "BIO2", "BIOES")
+        @test tags == expected
+
+        tag_scheme!(tags, "BIOES", "BIO2")
+        @test tags == original
+    end
+end