{-|
Module      : Gargantext.Core.Text.Terms.WithList
Copyright   : (c) CNRS, 2017-Present
License     : AGPL + CECILL v3
Maintainer  : team@gargantext.org
Stability   : experimental

Term extraction driven by a term list: every variant of a term is compiled
into a Knuth-Morris-Pratt pattern (see 'buildPatterns') and matched against
tokenised text (see 'replaceTerms', 'extractTermsWithList' and 'filterWith').
-}
14 {-# LANGUAGE BangPatterns #-}
16 module Gargantext.Core.Text.Terms.WithList where
18 import Data.List (null, concatMap)
20 import Data.Text (Text, concat)
21 import Gargantext.Prelude
22 import Gargantext.Core.Text.Context
23 import Gargantext.Core.Text.Terms.Mono (monoTextsBySentence)
24 import Prelude (error)
25 import qualified Data.Algorithms.KMP as KMP
26 import qualified Data.IntMap.Strict as IntMap
28 ------------------------------------------------------------------------
-- | A term variant compiled for Knuth-Morris-Pratt search over token lists.
--
-- NOTE(review): the reviewed copy of this declaration was truncated (no
-- '_pat_length' field, no closing brace). The field is restored because
-- 'buildPatterns' sorts on '_pat_length' and builds 'Pattern' with three
-- arguments, and 'replaceTerms' matches on three fields. Confirm against the
-- canonical file.
data Pattern = Pattern
  { _pat_table  :: !(KMP.Table Text)
    -- ^ KMP table built from the token sequence to search for.
  , _pat_length :: !Int
    -- ^ Number of tokens in that sequence.
  , _pat_terms  :: ![Text]
    -- ^ Label emitted when the sequence matches.
  }

-- | A list of patterns; 'buildPatterns' returns it sorted by decreasing
-- '_pat_length'.
type Patterns = [Pattern]
37 ------------------------------------------------------------------------
-- | Walk a token list from left to right and collect the label of every
-- pattern occurrence found.
--
-- All match positions of all patterns are indexed by start offset in @m@;
-- when several patterns start at the same offset, 'merge' keeps the longest.
-- Offsets with no match are stepped over one token at a time and contribute
-- nothing to the result.
--
-- NOTE(review): the reviewed copy of this function was missing several lines
-- (the @where@, the @otherwise@ guard, the @Just@ branch, the @m =@ binding
-- and the head of the comprehension). They are reconstructed from the types
-- and the surviving lines; the @Just@ branch in particular is an inference.
-- Confirm against the canonical file.
replaceTerms :: Patterns -> [Text] -> [[Text]]
replaceTerms pats terms = go 0
  where
    terms_len = length terms

    go ix
      | ix >= terms_len = []
      | otherwise =
          case IntMap.lookup ix m of
            Nothing          -> go (ix + 1)
            -- Emit the label and resume after the matched tokens
            -- (reconstructed, see the note above).
            Just (len, term) -> term : go (ix + len)

    -- Prefer the longer of two matches that start at the same offset.
    merge (len1, lab1) (len2, lab2) =
      if len2 < len1 then (len1, lab1) else (len2, lab2)

    -- Start offset -> (match length, label) for every pattern occurrence.
    m =
      IntMap.fromListWith merge
        [ (ix, (len, term))
        | Pattern pat len term <- pats, ix <- KMP.match pat terms ]
-- | Compile a term list into KMP patterns, longest token sequence first.
--
-- For each @(label, alts)@ entry one pattern is built for the label itself
-- and one for each alternative; all of them emit @label@ on a match.
--
-- Calls 'error' on malformed input:
--
-- * @"buildPatterns: ERR1"@ when a token sequence contains an empty token;
-- * @"buildPatterns: ERR2"@ when a token sequence is itself empty.
--
-- NOTE(review): both @where@ keywords and the @otherwise@ guard were absent
-- from the reviewed copy and are restored here.
buildPatterns :: TermList -> Patterns
buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
  where
    buildPattern (label, alts) = map f (label : alts)
      where
        f alt
          | "" `elem` alt = error "buildPatterns: ERR1"
          | null alt      = error "buildPatterns: ERR2"
          | otherwise     =
              Pattern (KMP.build alt) (length alt) label
                      --(Terms label $ Set.empty) -- TODO check stems
-- | Split a text with 'monoTextsBySentence' and run 'replaceTerms' over each
-- resulting token list, keeping one result per sentence.
extractTermsWithList :: Patterns -> Text -> Corpus [Text]
extractTermsWithList pats txt = map labelsIn sentences
  where
    sentences = monoTextsBySentence txt
    labelsIn  = replaceTerms pats
-- | Extract terms: like 'extractTermsWithList', but every sentence is
-- flattened into a single 'Text' by concatenating all matched labels.
--
-- NOTE(review): labels and their tokens are joined with no separator.
-- Confirm this is intended for multi-token labels.
--
-- NOTE(review): the trailing @. monoTextsBySentence@ was absent from the
-- reviewed copy. It is restored because the signature takes a 'Text' and
-- 'replaceTerms' consumes token lists.
--
-- >>> let termList = [(["chat blanc"], [["chat","blanc"]])] :: TermList
-- >>> extractTermsWithList' (buildPatterns termList) "Le chat blanc"
-- ["chat blanc"]
extractTermsWithList' :: Patterns -> Text -> [Text]
extractTermsWithList' pats = map (concat . map concat . replaceTerms pats)
                           . monoTextsBySentence
-- | Pair every element of @xs@ with the terms extracted from its text.
--
-- Despite the name nothing is dropped: the pairing is done with 'zip', so
-- an element whose text yields no term appears with an empty list.
--
-- NOTE(review): everything after @TermList@ in the signature was absent from
-- the reviewed copy. It is restored from the body, which passes 'zip' to
-- 'filterWith''.
filterWith :: TermList
           -> (a -> Text)      -- ^ how to obtain the text of an element
           -> [a]              -- ^ elements to annotate
           -> [(a, [Text])]
filterWith termList f xs = filterWith' termList f zip xs
-- | Extract terms from every element of @xs@ and hand the elements and the
-- per-element results to a caller-supplied combining function.
--
-- The patterns are compiled from the term list once and shared by all
-- elements.
--
-- NOTE(review): the reviewed copy was missing parts of the signature and the
-- final @map f xs@ stage of the body. They are restored from the types
-- ('extractTermsWithList'' consumes a 'Text', @f'@ consumes @[a]@ and
-- @[[Text]]@).
filterWith' :: TermList
            -> (a -> Text)                -- ^ how to obtain the text of an element
            -> ([a] -> [[Text]] -> [b])   -- ^ combines elements with their extracted terms
            -> [a]                        -- ^ elements to process
            -> [b]
filterWith' termList f f' xs =
  f' xs (map (extractTermsWithList' pats . f) xs)
  where
    pats = buildPatterns termList