2 Module : Gargantext.Core.Text.Terms.WithList
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Here is a longer description of this module, containing some
11 commentary with @some markup@.
14 {-# LANGUAGE BangPatterns #-}
16 module Gargantext.Core.Text.Terms.WithList where
18 import Data.List (null)
20 import Data.Text (Text, concat, unwords)
21 import Gargantext.Prelude
22 import Gargantext.Core.Text.Context
23 import Gargantext.Core.Text.Terms.Mono (monoTextsBySentence)
24 import Gargantext.Core.Types (TermsCount)
25 import Gargantext.Core.Utils (groupWithCounts)
26 import Prelude (error)
27 import qualified Data.Algorithms.KMP as KMP
28 import qualified Data.IntMap.Strict as IntMap
29 import qualified Data.List as List
30 ------------------------------------------------------------------------
32 data Pattern = Pattern
33 { _pat_table :: !(KMP.Table Text)
35 , _pat_terms :: ![Text]
37 type Patterns = [Pattern]
39 ------------------------------------------------------------------------
41 data ReplaceTerms = KeepAll | LongestOnly
43 replaceTerms :: ReplaceTerms -> Patterns -> [Text] -> [[Text]]
44 replaceTerms rplaceTerms pats terms = go 0
46 terms_len = length terms
48 go ix | ix >= terms_len = []
50 case IntMap.lookup ix m of
51 Nothing -> go (ix + 1)
57 | Pattern pat len term <- pats, ix <- KMP.match pat terms ]
59 toMap = case rplaceTerms of
60 KeepAll -> IntMap.fromList
61 LongestOnly -> IntMap.fromListWith merge
63 merge (len1, lab1) (len2, lab2) =
64 if len2 < len1 then (len1, lab1) else (len2, lab2)
66 buildPatterns :: TermList -> Patterns
67 buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
69 buildPattern (label, alts) = map f $ map (\alt -> filter (/= "") alt) (label : alts)
71 f alt | "" `elem` alt = error ("buildPatterns: ERR1" <> show(label))
72 | null alt = error "buildPatterns: ERR2"
74 Pattern (KMP.build alt) (length alt) label
75 --(Terms label $ Set.empty) -- TODO check stems
78 --------------------------------------------------------------------------
80 type MatchedText = Text
81 termsInText :: Patterns -> Text -> [(MatchedText, TermsCount)]
82 termsInText pats txt = groupWithCounts
85 $ extractTermsWithList pats txt
87 --------------------------------------------------------------------------
89 extractTermsWithList :: Patterns -> Text -> Corpus [Text]
90 extractTermsWithList pats = map (replaceTerms KeepAll pats) . monoTextsBySentence
93 -- >>> let termList = [(["chat blanc"], [["chat","blanc"]])] :: TermList
94 -- extractTermsWithList' (buildPatterns termList) "Le chat blanc"["chat blanc"]
96 extractTermsWithList' :: Patterns -> Text -> [Text]
97 extractTermsWithList' pats = map (concat . map concat . replaceTerms KeepAll pats)
100 --------------------------------------------------------------------------
103 filterWith :: TermList
107 filterWith termList f xs = filterWith' termList f zip xs
110 filterWith' :: TermList
112 -> ([a] -> [[Text]] -> [b])
115 filterWith' termList f f' xs = f' xs
116 $ map (extractTermsWithList' pats)
119 pats = buildPatterns termList