2 Module : Gargantext.Core.Text.Terms.WithList
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Here is a longer description of this module, containing some
11 commentary with @some markup@.
14 {-# LANGUAGE BangPatterns #-}
16 module Gargantext.Core.Text.Terms.WithList where
18 import Data.List (null)
20 import Data.Text (Text, concat, unwords)
21 import Gargantext.Prelude
22 import Gargantext.Core.Text.Context
23 import Gargantext.Core.Text.Terms.Mono (monoTextsBySentence)
24 import Gargantext.Core.Types (TermsCount)
25 import Gargantext.Core.Utils (groupWithCounts)
26 import Prelude (error)
27 import qualified Data.Algorithms.KMP as KMP
28 import qualified Data.IntMap.Strict as IntMap
29 import qualified Data.List as List
30 ------------------------------------------------------------------------
32 data Pattern = Pattern
33 { _pat_table :: !(KMP.Table Text)
35 , _pat_terms :: ![Text]
37 type Patterns = [Pattern]
39 ------------------------------------------------------------------------
40 replaceTerms :: Patterns -> [Text] -> [[Text]]
41 replaceTerms pats terms = go 0
43 terms_len = length terms
45 go ix | ix >= terms_len = []
47 case IntMap.lookup ix m of
48 Nothing -> go (ix + 1)
53 merge (len1, lab1) (len2, lab2) =
54 if len2 < len1 then (len1, lab1) else (len2, lab2)
57 IntMap.fromListWith merge
59 | Pattern pat len term <- pats, ix <- KMP.match pat terms ]
61 buildPatterns :: TermList -> Patterns
62 buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
64 buildPattern (label, alts) = map f $ map (\alt -> filter (/= "") alt) (label : alts)
66 f alt | "" `elem` alt = error ("buildPatterns: ERR1" <> show(label))
67 | null alt = error "buildPatterns: ERR2"
69 Pattern (KMP.build alt) (length alt) label
70 --(Terms label $ Set.empty) -- TODO check stems
73 --------------------------------------------------------------------------
76 type MatchedText = Text
77 termsInText :: Patterns -> BlockText -> [(MatchedText, TermsCount)]
78 termsInText pats txt = groupWithCounts
81 $ extractTermsWithList pats txt
83 --------------------------------------------------------------------------
85 extractTermsWithList :: Patterns -> Text -> Corpus [Text]
86 extractTermsWithList pats = map (replaceTerms pats) . monoTextsBySentence
89 -- >>> let termList = [(["chat blanc"], [["chat","blanc"]])] :: TermList
90 -- extractTermsWithList' (buildPatterns termList) "Le chat blanc"["chat blanc"]
92 extractTermsWithList' :: Patterns -> Text -> [Text]
93 extractTermsWithList' pats = map (concat . map concat . replaceTerms pats)
96 --------------------------------------------------------------------------
99 filterWith :: TermList
103 filterWith termList f xs = filterWith' termList f zip xs
106 filterWith' :: TermList
108 -> ([a] -> [[Text]] -> [b])
111 filterWith' termList f f' xs = f' xs
112 $ map (extractTermsWithList' pats)
115 pats = buildPatterns termList