2 Module : Gargantext.Core.Text.Terms.WithList
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Here is a longer description of this module, containing some
11 commentary with @some markup@.
14 {-# LANGUAGE BangPatterns #-}
16 module Gargantext.Core.Text.Terms.WithList where
18 import Data.List (null)
20 import Data.Text (Text, concat, unwords)
21 import Gargantext.Prelude
22 import Gargantext.Core.Text.Context
23 import Gargantext.Core.Text.Terms.Mono (monoTextsBySentence)
24 import Prelude (error)
25 import qualified Data.Algorithms.KMP as KMP
26 import qualified Data.IntMap.Strict as IntMap
27 import qualified Data.List as List
28 ------------------------------------------------------------------------
30 data Pattern = Pattern
31 { _pat_table :: !(KMP.Table Text)
33 , _pat_terms :: ![Text]
35 type Patterns = [Pattern]
37 ------------------------------------------------------------------------
38 replaceTerms :: Patterns -> [Text] -> [[Text]]
39 replaceTerms pats terms = go 0
41 terms_len = length terms
43 go ix | ix >= terms_len = []
45 case IntMap.lookup ix m of
46 Nothing -> go (ix + 1)
51 merge (len1, lab1) (len2, lab2) =
52 if len2 < len1 then (len1, lab1) else (len2, lab2)
55 IntMap.fromListWith merge
57 | Pattern pat len term <- pats, ix <- KMP.match pat terms ]
59 buildPatterns :: TermList -> Patterns
60 buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
62 buildPattern (label, alts) = map f $ map (\alt -> filter (/= "") alt) (label : alts)
64 f alt | "" `elem` alt = error ("buildPatterns: ERR1" <> show(label))
65 | null alt = error "buildPatterns: ERR2"
67 Pattern (KMP.build alt) (length alt) label
68 --(Terms label $ Set.empty) -- TODO check stems
71 --------------------------------------------------------------------------
74 type MatchedText = Text
75 termsInText :: Patterns -> BlockText -> [MatchedText]
76 termsInText pats txt = List.nub
79 $ extractTermsWithList pats txt
81 --------------------------------------------------------------------------
83 extractTermsWithList :: Patterns -> Text -> Corpus [Text]
84 extractTermsWithList pats = map (replaceTerms pats) . monoTextsBySentence
87 -- >>> let termList = [(["chat blanc"], [["chat","blanc"]])] :: TermList
88 -- extractTermsWithList' (buildPatterns termList) "Le chat blanc"["chat blanc"]
90 extractTermsWithList' :: Patterns -> Text -> [Text]
91 extractTermsWithList' pats = map (concat . map concat . replaceTerms pats)
94 --------------------------------------------------------------------------
97 filterWith :: TermList
101 filterWith termList f xs = filterWith' termList f zip xs
104 filterWith' :: TermList
106 -> ([a] -> [[Text]] -> [b])
109 filterWith' termList f f' xs = f' xs
110 $ map (extractTermsWithList' pats)
113 pats = buildPatterns termList