2 Module : Gargantext.Core.Text.Terms.WithList
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Here is a longer description of this module, containing some
11 commentary with @some markup@.
14 {-# LANGUAGE BangPatterns #-}
16 module Gargantext.Core.Text.Terms.WithList where
18 import Data.List (null)
20 import Data.Text (Text, concat, unwords)
21 import Gargantext.Prelude
22 import Gargantext.Core.Text.Context
23 import Gargantext.Core.Text.Terms.Mono (monoTextsBySentence)
24 import Prelude (error)
25 import qualified Data.Algorithms.KMP as KMP
26 import qualified Data.IntMap.Strict as IntMap
27 import qualified Data.List as List
28 ------------------------------------------------------------------------
30 data Pattern = Pattern
31 { _pat_table :: !(KMP.Table Text)
33 , _pat_terms :: ![Text]
35 type Patterns = [Pattern]
37 ------------------------------------------------------------------------
38 replaceTerms :: Patterns -> [Text] -> [[Text]]
39 replaceTerms pats terms = go 0
41 terms_len = length terms
43 go ix | ix >= terms_len = []
45 case IntMap.lookup ix m of
46 Nothing -> go (ix + 1)
51 merge (len1, lab1) (len2, lab2) =
52 if len2 < len1 then (len1, lab1) else (len2, lab2)
55 IntMap.fromListWith merge
57 | Pattern pat len term <- pats, ix <- KMP.match pat terms ]
59 buildPatterns :: TermList -> Patterns
60 buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
62 buildPattern (label, alts) = map f $ map (\alt -> filter (/= "") alt) (label : alts)
64 f alt | "" `elem` alt = error ("buildPatterns: ERR1" <> show(label))
65 | null alt = error "buildPatterns: ERR2"
67 Pattern (KMP.build alt) (length alt) label
68 --(Terms label $ Set.empty) -- TODO check stems
71 --------------------------------------------------------------------------
73 type MatchedText = Text
74 termsInText :: Patterns -> Text -> [MatchedText]
75 termsInText pats txt = List.nub
78 $ extractTermsWithList pats txt
80 --------------------------------------------------------------------------
82 extractTermsWithList :: Patterns -> Text -> Corpus [Text]
83 extractTermsWithList pats = map (replaceTerms pats) . monoTextsBySentence
86 -- >>> let termList = [(["chat blanc"], [["chat","blanc"]])] :: TermList
87 -- extractTermsWithList' (buildPatterns termList) "Le chat blanc"["chat blanc"]
89 extractTermsWithList' :: Patterns -> Text -> [Text]
90 extractTermsWithList' pats = map (concat . map concat . replaceTerms pats)
93 --------------------------------------------------------------------------
96 filterWith :: TermList
100 filterWith termList f xs = filterWith' termList f zip xs
103 filterWith' :: TermList
105 -> ([a] -> [[Text]] -> [b])
108 filterWith' termList f f' xs = f' xs
109 $ map (extractTermsWithList' pats)
112 pats = buildPatterns termList