2 Module : Gargantext.Text.Terms.WithList
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Here is a longer description of this module, containing some
11 commentary with @some markup@.
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE BangPatterns #-}
17 module Gargantext.Text.Terms.WithList where
19 import Prelude (String)
20 import qualified Data.Algorithms.KMP as KMP
21 import Data.Char (isSpace)
22 import qualified Data.Text as T
23 import Data.Text (Text)
24 import qualified Data.IntMap.Strict as IntMap
26 import Gargantext.Text.Context
27 import Gargantext.Text.Terms.Mono (monoTextsBySentence)
29 import Gargantext.Prelude
30 import Data.List (concatMap)
34 type Pattern = KMP.Table Term
35 type TermList = [(Label, [[Term]])]
36 type Patterns = [(Pattern, Int, Label)]
39 replaceTerms :: (Term -> Label) -> Patterns -> Sentence Term -> Sentence Label
40 replaceTerms labelPolicy pats terms = go 0 terms
44 case IntMap.lookup ix m of
45 Nothing -> t : go (ix + 1) ts
47 label : go (ix + len) (drop (len - 1) ts)
49 -- | merge with labelPolicy (can be a Map Term label)
50 merge (len1, lab1) (len2, lab2) =
51 if (labelPolicy lab1) == lab2 then (len2, lab2) else (len1, lab1)
54 IntMap.fromListWith merge
56 | (pat, len, label) <- pats, ix <- KMP.match pat terms ]
58 buildPatterns :: TermList -> Patterns
59 buildPatterns = concatMap buildPattern
61 buildPattern (label, alts) = map f alts
63 f alt = (KMP.build alt, length alt, label)
65 extractTermsWithList :: (Term -> Label) -> Patterns -> Text -> Corpus Label
66 extractTermsWithList labelPolicy pats = map (replaceTerms labelPolicy pats) . monoTextsBySentence