2 Module : Gargantext.Core.Text.Terms.WithList
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Here is a longer description of this module, containing some
11 commentary with @some markup@.
14 {-# LANGUAGE BangPatterns #-}
16 module Gargantext.Core.Text.Terms.WithList where
18 import Data.List (null)
20 import Data.Text (Text, concat, unwords)
21 import Gargantext.Prelude
22 import Gargantext.Core.Text.Context
23 import Gargantext.Core.Text.Terms.Mono (monoTextsBySentence)
24 import Gargantext.Core.Types (TermsCount)
25 import Gargantext.Core.Utils (groupWithCounts)
26 import Prelude (error)
27 import qualified Data.Algorithms.KMP as KMP
28 import qualified Data.IntMap.Strict as IntMap
29 import qualified Data.List as List
30 ------------------------------------------------------------------------
32 data Pattern = Pattern
33 { _pat_table :: !(KMP.Table Text)
35 , _pat_terms :: ![Text]
37 type Patterns = [Pattern]
39 ------------------------------------------------------------------------
40 replaceTerms :: Patterns -> [Text] -> [[Text]]
41 replaceTerms pats terms = go 0
43 terms_len = length terms
45 go ix | ix >= terms_len = []
47 case IntMap.lookup ix m of
48 Nothing -> go (ix + 1)
53 merge (len1, lab1) (len2, lab2) =
54 if len2 < len1 then (len1, lab1) else (len2, lab2)
57 IntMap.fromListWith merge
59 | Pattern pat len term <- pats, ix <- KMP.match pat terms ]
61 buildPatterns :: TermList -> Patterns
62 buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
64 buildPattern (label, alts) = map f $ map (\alt -> filter (/= "") alt) (label : alts)
66 f alt | "" `elem` alt = error ("buildPatterns: ERR1" <> show(label))
67 | null alt = error "buildPatterns: ERR2"
69 Pattern (KMP.build alt) (length alt) label
70 --(Terms label $ Set.empty) -- TODO check stems
73 --------------------------------------------------------------------------
75 type MatchedText = Text
76 termsInText :: Patterns -> Text -> [(MatchedText, TermsCount)]
77 termsInText pats txt = groupWithCounts
80 $ extractTermsWithList pats txt
82 --------------------------------------------------------------------------
84 extractTermsWithList :: Patterns -> Text -> Corpus [Text]
85 extractTermsWithList pats = map (replaceTerms pats) . monoTextsBySentence
88 -- >>> let termList = [(["chat blanc"], [["chat","blanc"]])] :: TermList
89 -- extractTermsWithList' (buildPatterns termList) "Le chat blanc"["chat blanc"]
91 extractTermsWithList' :: Patterns -> Text -> [Text]
92 extractTermsWithList' pats = map (concat . map concat . replaceTerms pats)
95 --------------------------------------------------------------------------
98 filterWith :: TermList
102 filterWith termList f xs = filterWith' termList f zip xs
105 filterWith' :: TermList
107 -> ([a] -> [[Text]] -> [b])
110 filterWith' termList f f' xs = f' xs
111 $ map (extractTermsWithList' pats)
114 pats = buildPatterns termList