2 Module : Gargantext.Text.Terms.WithList
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Here is a longer description of this module, containing some
11 commentary with @some markup@.
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE BangPatterns #-}
17 module Gargantext.Text.Terms.WithList where
19 import qualified Data.Algorithms.KMP as KMP
20 import Data.Text (Text)
21 import qualified Data.IntMap.Strict as IntMap
23 import Gargantext.Core.Types (Terms(Terms))
24 import Gargantext.Text.Context
25 import Gargantext.Text.Terms.Mono (monoTextsBySentence)
27 import Gargantext.Prelude
28 import Data.List (concatMap)
30 import qualified Data.Set as Set
33 ------------------------------------------------------------------------
-- | One searchable spelling of a term, pre-compiled for KMP matching.
--
-- The three fields are filled positionally by 'buildPatterns' and read
-- positionally by 'replaceTerms', so their order matters.
data Pattern = Pattern
  { _pat_table  :: !(KMP.Table Term) -- ^ KMP table built from the token sequence
  , _pat_length :: !Int              -- ^ number of tokens in that sequence
  , _pat_terms  :: !Terms            -- ^ the 'Terms' value this spelling stands for
  }

-- | A list of patterns.
type Patterns = [Pattern]
42 ------------------------------------------------------------------------
-- | Rewrite a tokenised sentence into the sequence of 'Terms' whose patterns
-- occur in it.
--
-- The sentence is scanned from index 0. At an index where one or more
-- patterns start matching, the longest one is selected, its 'Terms' is
-- emitted and the scan resumes just past the matched span. At an index where
-- no pattern starts, nothing is emitted and the scan moves one token forward.
replaceTerms :: Patterns -> Sentence Term -> Sentence Terms
replaceTerms pats terms = go 0
  where
    terms_len = length terms

    go ix
      | ix >= terms_len = []
      | otherwise =
          case IntMap.lookup ix m of
            Nothing          -> go (ix + 1)
            -- NOTE(review): assumes every pattern has a positive length;
            -- a zero-length pattern would never advance the index.
            Just (len, term) -> term : go (ix + len)

    -- Combining function for matches starting at the same index: keeps
    -- whichever span is longer; on a tie the second argument is kept.
    merge (len1, lab1) (len2, lab2) =
      if len2 < len1 then (len1, lab1) else (len2, lab2)

    -- Start index of each match -> (span length, label) of the winning pattern.
    -- The label is bound as @term@ so that it does not shadow the sentence
    -- @terms@, which is what 'KMP.match' must be run against.
    m =
      IntMap.fromListWith merge
        [ (ix, (len, term))
        | Pattern pat len term <- pats, ix <- KMP.match pat terms ]
-- | Compile a term list into search patterns.
--
-- Each entry is a @(label, alts)@ pair. One 'Pattern' is produced for the
-- label itself and one for every alternative; all of them carry the same
-- 'Terms' value built from the label. The resulting list is ordered by
-- decreasing pattern length.
buildPatterns :: TermList -> Patterns
buildPatterns termList =
  sortWith (Down . _pat_length) (concatMap buildPattern termList)
  where
    buildPattern (label, alts) =
      [ Pattern (KMP.build alt) (length alt) (Terms label Set.empty) -- TODO check stems
      | alt <- label : alts
      ]
-- | Apply 'replaceTerms' with the given patterns to every element that
-- 'monoTextsBySentence' produces from the input text.
extractTermsWithList :: Patterns -> Text -> Corpus Terms
extractTermsWithList pats txt = map tagSentence (monoTextsBySentence txt)
  where
    tagSentence = replaceTerms pats