Directional coom (#264)

* Adding the directional co-occurrence matrix. * Added tests
JuliaText · Jan 9, 2024 · ce69b59 · ce69b59
1 parent 0b5992c
commit ce69b59
Show file tree

Hide file tree

Showing 2 changed files with 116 additions and 54 deletions.
diff --git a/src/coom.jl b/src/coom.jl
@@ -8,14 +8,14 @@
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 """
-    coo_matrix(::Type{T}, doc::Vector{AbstractString}, vocab::OrderedDict{AbstractString, Int}, window::Int, normalize::Bool)
+    coo_matrix(::Type{T}, doc::Vector{AbstractString}, vocab::OrderedDict{AbstractString, Int}, window::Int, normalize::Bool, mode::Symbol)
 
 Basic low-level function that calculates the co-occurrence matrix of a document.
 Returns a sparse co-occurrence matrix sized `n × n` where `n = length(vocab)`
 with elements of type `T`. The document `doc` is represented by a vector of its
 terms (in order). The keywords `window` and `normalize` indicate the size of the
 sliding word window in which co-occurrences are counted and whether to normalize
-of not the counts by the distance between word positions.
+or not the counts by the distance between word positions. The `mode` keyword can be either `:default` or `:directional` and indicates whether the co-occurrence matrix should be directional or not. In both modes the result is an `n × n` matrix where `n = length(vocab)`. If `mode` is `:directional`, `coom[i,j]` will be the number of times the `i`-th and `j`-th terms of `vocab` co-occur in the document `doc`. If `mode` is `:default`, `coom[i,j]` will be twice that number (once for each direction, from i to j + from j to i).
 
 # Example
 ```
@@ -30,28 +30,48 @@ julia> using TextAnalysis, DataStructures
   [1, 2]  =  2.0
   [3, 2]  =  0.3999
   [2, 3]  =  0.3999
+
+julia> using TextAnalysis, DataStructures
+       doc = StringDocument("This is a text about an apple. There are many texts about apples.")
+       docv = TextAnalysis.tokenize(language(doc), text(doc))
+       vocab = OrderedDict("This"=>1, "is"=>2, "apple."=>3)
+       TextAnalysis.coo_matrix(Float16, docv, vocab, 5, true, :directional)
+
+3×3 SparseArrays.SparseMatrixCSC{Float16,Int64} with 4 stored entries:
+  [2, 1]  =  1.0
+  [1, 2]  =  1.0
+  [3, 2]  =  0.1999
+  [2, 3]  =  0.1999
 ```
 """
 function coo_matrix(::Type{T},
-                    doc::Vector{<:AbstractString},
-                    vocab::OrderedDict{<:AbstractString, Int},
-                    window::Int,
-                    normalize::Bool=true) where T<:AbstractFloat
+    doc::Vector{<:AbstractString},
+    vocab::OrderedDict{<:AbstractString,
+        Int},
+    window::Int,
+    normalize::Bool=true,
+    mode::Symbol=:default) where {T<:AbstractFloat}
+    # Initializations
     n = length(vocab)
     m = length(doc)
     coom = spzeros(T, n, n)
     # Count co-occurrences
     for (i, token) in enumerate(doc)
+        inner_range = if mode == :directional
+            i:min(m, i + window)
+        else
+            max(1, i - window):min(m, i + window)
+        end
         row = get(vocab, token, nothing)
         isnothing(row) && continue
 
-        @inbounds for j in max(1, i - window):min(m, i + window)
+        # scan the window: forward only in :directional mode, both sides otherwise
+        @inbounds for j in inner_range
             i == j && continue
 
             wtoken = doc[j]
             col = get(vocab, wtoken, nothing)
             isnothing(col) && continue
-
             nm = T(ifelse(normalize, abs(i - j), 1))
             coom[row, col] += one(T) / nm
             coom[col, row] = coom[row, col]
@@ -60,9 +80,9 @@ function coo_matrix(::Type{T},
     return coom
 end
 
-coo_matrix(::Type{T}, doc::Vector{<:AbstractString}, vocab::Dict{<:AbstractString, Int},
-                    window::Int, normalize::Bool=true) where T<:AbstractFloat =
-            coo_matrix(T, doc, OrderedDict(vocab), window, normalize)
+coo_matrix(::Type{T}, doc::Vector{<:AbstractString}, vocab::Dict{<:AbstractString,Int},
+    window::Int, normalize::Bool=true, mode::Symbol=:default) where {T<:AbstractFloat} =
+    coo_matrix(T, doc, OrderedDict(vocab), window, normalize, mode)
 
 """
 Basic Co-occurrence Matrix (COOM) type.
@@ -75,9 +95,9 @@ the document or corpus
 columns of the co-occurrence matrix
 """
 struct CooMatrix{T}
-    coom::SparseMatrixCSC{T, Int}
+    coom::SparseMatrixCSC{T,Int}
     terms::Vector{String}
-    column_indices::OrderedDict{String, Int}
+    column_indices::OrderedDict{String,Int}
 end
 
 
@@ -91,66 +111,68 @@ can be a `Vector{String}`, an `AbstractDict` where the keys are the lexicon,
 or can be omitted, in which case the `lexicon` field of the corpus is used.
 """
 function CooMatrix{T}(crps::Corpus,
-                      terms::Vector{String};
-                      window::Int=5,
-                      normalize::Bool=true) where T<:AbstractFloat
+    terms::Vector{String};
+    window::Int=5,
+    normalize::Bool=true,
+    mode::Symbol=:default) where {T<:AbstractFloat}
     column_indices = OrderedDict(columnindices(terms))
     n = length(terms)
     coom = spzeros(T, n, n)
     for doc in crps
-        coom .+= coo_matrix(T, tokens(doc), column_indices, window, normalize)
+        coom .+= coo_matrix(T, tokens(doc), column_indices, window, normalize, mode)
     end
     return CooMatrix{T}(coom, terms, column_indices)
 end
 
-CooMatrix(crps::Corpus, terms::Vector{String}; window::Int=5, normalize::Bool=true) =
-    CooMatrix{Float64}(crps, terms, window=window, normalize=normalize)
+CooMatrix(crps::Corpus, terms::Vector{String}; window::Int=5, normalize::Bool=true, mode::Symbol=:default) =
+    CooMatrix{Float64}(crps, terms, window=window, normalize=normalize, mode=mode)
 
-CooMatrix{T}(crps::Corpus, lex::AbstractDict; window::Int=5, normalize::Bool=true
-            ) where T<:AbstractFloat =
-    CooMatrix{T}(crps, collect(keys(lex)), window=window, normalize=normalize)
+CooMatrix{T}(crps::Corpus, lex::AbstractDict; window::Int=5, normalize::Bool=true, mode::Symbol=:default) where {T<:AbstractFloat} =
+    CooMatrix{T}(crps, collect(keys(lex)), window=window, normalize=normalize, mode=mode)
 
-CooMatrix(crps::Corpus, lex::AbstractDict; window::Int=5, normalize::Bool=true) =
-    CooMatrix{Float64}(crps, lex, window=window, normalize=normalize)
+CooMatrix(crps::Corpus, lex::AbstractDict; window::Int=5, normalize::Bool=true, mode::Symbol=:default) =
+    CooMatrix{Float64}(crps, lex, window=window, normalize=normalize, mode=mode)
 
-CooMatrix{T}(crps::Corpus; window::Int=5, normalize::Bool=true) where T<:AbstractFloat = begin
+CooMatrix{T}(crps::Corpus; window::Int=5, normalize::Bool=true, mode::Symbol=:default) where {T<:AbstractFloat} = begin
     isempty(lexicon(crps)) && update_lexicon!(crps)
-    CooMatrix{T}(crps, lexicon(crps), window=window, normalize=normalize)
+    CooMatrix{T}(crps, lexicon(crps), window=window, normalize=normalize, mode=mode)
 end
 
-CooMatrix(crps::Corpus; window::Int=5, normalize::Bool=true) = begin
+CooMatrix(crps::Corpus; window::Int=5, normalize::Bool=true, mode::Symbol=:default) = begin
     isempty(lexicon(crps)) && update_lexicon!(crps)
-    CooMatrix{Float64}(crps, lexicon(crps), window=window, normalize=normalize)
+    CooMatrix{Float64}(crps, lexicon(crps), window=window, normalize=normalize, mode=mode)
 end
 
 # Document methods
 function CooMatrix{T}(doc::AbstractDocument,
-                      terms::Vector{String};
-                      window::Int=5,
-                      normalize::Bool=true) where T<:AbstractFloat
+    terms::Vector{String};
+    window::Int=5,
+    normalize::Bool=true,
+    mode::Symbol=:default) where {T<:AbstractFloat}
     # Initializations
     column_indices = OrderedDict(columnindices(terms))
-    coom = coo_matrix(T, tokens(doc), column_indices, window, normalize)
+    coom = coo_matrix(T, tokens(doc), column_indices, window, normalize, mode)
     return CooMatrix{T}(coom, terms, column_indices)
 end
 
 function CooMatrix{T}(doc::NGramDocument,
-                      terms::Vector{String};
-                      window::Int=5,
-                      normalize::Bool=true) where T <: AbstractFloat
+    terms::Vector{String};
+    window::Int=5,
+    normalize::Bool=true,
+    mode::Symbol=:default) where {T<:AbstractFloat}
     error("The Co occurrence matrix of an NGramDocument can't be created.")
 end
 
-CooMatrix(doc, terms::Vector{String}; window::Int=5, normalize::Bool=true) =
-    CooMatrix{Float64}(doc, terms, window=window, normalize=normalize)
+CooMatrix(doc, terms::Vector{String}; window::Int=5, normalize::Bool=true, mode::Symbol=:default) =
+    CooMatrix{Float64}(doc, terms, window=window, normalize=normalize, mode=mode)
 
-function CooMatrix{T}(doc; window::Int=5, normalize::Bool=true) where T<:AbstractFloat
+function CooMatrix{T}(doc; window::Int=5, normalize::Bool=true, mode::Symbol=:default) where {T<:AbstractFloat}
     terms = unique(String.(tokens(doc)))
-    CooMatrix{T}(doc, terms, window=window, normalize=normalize)
+    CooMatrix{T}(doc, terms, window=window, normalize=normalize, mode=mode)
 end
 
-CooMatrix(doc; window::Int=5, normalize::Bool=true) =
-    CooMatrix{Float64}(doc, window=window, normalize=normalize)
+CooMatrix(doc; window::Int=5, normalize::Bool=true, mode::Symbol=:default) =
+    CooMatrix{Float64}(doc, window=window, normalize=normalize, mode=mode)
 
 """
     coom(c::CooMatrix)
@@ -167,5 +189,5 @@ with the `entity`. The `CooMatrix{T}` will first have to
 be created in order for the actual matrix to be accessed.
 """
 coom(entity, eltype::Type{T}=Float;
-        window::Int=5, normalize::Bool=true) where T<:AbstractFloat =
-    coom(CooMatrix{T}(entity, window=window, normalize=normalize))
+    window::Int=5, normalize::Bool=true, mode::Symbol=:default) where {T<:AbstractFloat} =
+    coom(CooMatrix{T}(entity, window=window, normalize=normalize, mode=mode))
diff --git a/test/coom.jl b/test/coom.jl
@@ -1,14 +1,15 @@
 @testset "COOM (Co-occurence Matrix)" begin
     doc_raw = StringDocument("This is a document. It has two sentences.")
-    prepare!(doc_raw, strip_punctuation|strip_whitespace|strip_case)
+    prepare!(doc_raw, strip_punctuation | strip_whitespace | strip_case)
     doc = text(doc_raw)
     sd = StringDocument(doc)
     td = TokenDocument(doc)
     nd = NGramDocument(doc)
     crps = Corpus([sd, td])
     T = Float16
     # Results for window = 5, all terms in document used
-    expected_result = [ # for window == 5
+    # expected_result_C is the expected matrix for the normalized and default mode case.
+    expected_result_C = [ # for window == 5
         0.0 2.0 1.0 2/3 0.5 0.4 0.0 0.0
         2.0 0.0 2.0 1.0 2/3 0.5 0.4 0.0
         1.0 2.0 0.0 2.0 1.0 2/3 0.5 0.4
@@ -17,14 +18,28 @@
         0.4 0.5 2/3 1.0 2.0 0.0 2.0 1.0
         0.0 0.4 0.5 2/3 1.0 2.0 0.0 2.0
         0.0 0.0 0.4 0.5 2/3 1.0 2.0 0.0]
+
+    # expected_result_D is the expected matrix for the normalized and directional mode case.
+    expected_result_D = [ # for window == 5
+        0.0 1.0 0.5 1/3 0.25 0.2 0.0 0.0
+        1.0 0.0 1.0 0.5 1/3 0.25 0.2 0.0
+        0.5 1.0 0.0 1.0 0.5 1/3 0.25 0.2
+        1/3 0.5 1.0 0.0 1.0 0.5 1/3 0.25
+        0.25 1/3 0.5 1.0 0.0 1.0 0.5 1/3
+        0.2 0.25 1/3 0.5 1.0 0.0 1.0 0.5
+        0.0 0.2 0.25 1/3 0.5 1.0 0.0 1.0
+        0.0 0.0 0.2 0.25 1/3 0.5 1.0 0.0]
     # Verify untyped constructor
     terms = tokens(td)
     for d in [sd, td, crps]
         C = TextAnalysis.CooMatrix(d, terms)
+        D = TextAnalysis.CooMatrix(d, terms, mode=:directional)
         if !(d isa Corpus)
-            @test TextAnalysis.coom(C) == expected_result
+            @test TextAnalysis.coom(C) == expected_result_C
+            @test TextAnalysis.coom(D) == expected_result_D
         else
-            @test TextAnalysis.coom(C) == length(crps) * expected_result
+            @test TextAnalysis.coom(C) == length(crps) * expected_result_C
+            @test TextAnalysis.coom(D) == length(crps) * expected_result_D
         end
     end
     @test_throws ErrorException TextAnalysis.CooMatrix(nd)
@@ -33,28 +48,53 @@
     terms = tokens(td)
     for d in [sd, td, crps]
         C = TextAnalysis.CooMatrix{T}(d, terms)
+        D = TextAnalysis.CooMatrix{T}(d, terms, mode=:directional)
         @test C isa TextAnalysis.CooMatrix{T}
         if !(d isa Corpus)
-            @test TextAnalysis.coom(C) == T.(expected_result)
+            @test TextAnalysis.coom(C) == T.(expected_result_C)
+            @test TextAnalysis.coom(D) == T.(expected_result_D)
         else
-            @test TextAnalysis.coom(C) == length(crps) * T.(expected_result)
+            @test TextAnalysis.coom(C) == length(crps) * T.(expected_result_C)
+            @test TextAnalysis.coom(D) == length(crps) * T.(expected_result_D)
         end
     end
     @test_throws ErrorException TextAnalysis.CooMatrix{T}(nd)
 
     # Results for window = 1, custom terms
     terms = ["this", "document", "it"]
-    expected_result = [0.0 0.0 0.0; # document
-                       0.0 0.0 2.0; # it
-                       0.0 2.0 0.0] # this
+    expected_result_C = [0.0 0.0 0.0; # document
+        0.0 0.0 2.0; # it
+        0.0 2.0 0.0] # this
+
+    expected_result_D = [0.0 0.0 0.0; # document
+        0.0 0.0 1.0; # it
+        0.0 1.0 0.0] # this
+
     # Verify untyped constructor
+    for d in [sd, td, crps]
+        C = TextAnalysis.CooMatrix(d, terms, window=1)
+        D = TextAnalysis.CooMatrix(d, terms, window=1, mode=:directional)
+        if !(d isa Corpus)
+            @test TextAnalysis.coom(C) == T.(expected_result_C)
+            @test TextAnalysis.coom(D) == T.(expected_result_D)
+        else
+            @test TextAnalysis.coom(C) == length(crps) * T.(expected_result_C)
+            @test TextAnalysis.coom(D) == length(crps) * T.(expected_result_D)
+        end
+    end
+    @test_throws ErrorException TextAnalysis.CooMatrix(nd)
+
+    # Verify typed constructor
     for d in [sd, td, crps]
         C = TextAnalysis.CooMatrix{T}(d, terms, window=1)
+        D = TextAnalysis.CooMatrix{T}(d, terms, window=1, mode=:directional)
         @test C isa TextAnalysis.CooMatrix{T}
         if !(d isa Corpus)
-            @test TextAnalysis.coom(C) == T.(expected_result)
+            @test TextAnalysis.coom(C) == T.(expected_result_C)
+            @test TextAnalysis.coom(D) == T.(expected_result_D)
         else
-            @test TextAnalysis.coom(C) == length(crps) * T.(expected_result)
+            @test TextAnalysis.coom(C) == length(crps) * T.(expected_result_C)
+            @test TextAnalysis.coom(D) == length(crps) * T.(expected_result_D)
         end
     end
     @test_throws ErrorException TextAnalysis.CooMatrix{T}(nd)