2 Module : Gargantext.Text.Terms.WithList
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Here is a longer description of this module, containing some
11 commentary with @some markup@.
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE BangPatterns #-}
18 module Gargantext.Text.Terms.WithList where
20 import qualified Data.Algorithms.KMP as KMP
21 import Data.Text (Text, concat, unpack)
22 import qualified Data.IntMap.Strict as IntMap
24 import Gargantext.Text.Context
25 import Gargantext.Text.Terms.Mono (monoTextsBySentence)
27 import Prelude (error)
28 import Gargantext.Prelude
29 import Data.List (null, concatMap)
33 ------------------------------------------------------------------------
35 data Pattern = Pattern
36 { _pat_table :: !(KMP.Table Text)
38 , _pat_terms :: ![Text]
40 type Patterns = [Pattern]
42 ------------------------------------------------------------------------
44 replaceTerms :: Patterns -> [Text] -> [[Text]]
45 replaceTerms pats terms = go 0
47 terms_len = length terms
49 go ix | ix >= terms_len = []
51 case IntMap.lookup ix m of
52 Nothing -> go (ix + 1)
57 merge (len1, lab1) (len2, lab2) =
58 if len2 < len1 then (len1, lab1) else (len2, lab2)
61 IntMap.fromListWith merge
63 | Pattern pat len term <- pats, ix <- KMP.match pat terms ]
65 buildPatterns :: TermList -> Patterns
66 buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
68 buildPattern (label, alts) = map f (label : alts)
70 f alt | "" `elem` alt = error "buildPatterns: ERR1"
71 | null alt = error "buildPatterns: ERR2"
73 Pattern (KMP.build alt) (length alt) label
74 --(Terms label $ Set.empty) -- TODO check stems
76 extractTermsWithList :: Patterns -> Text -> Corpus [Text]
77 extractTermsWithList pats = map (replaceTerms pats) . monoTextsBySentence
80 -- >>> let termList = [(["chat blanc"], [["chat","blanc"]])] :: TermList
81 -- extractTermsWithList' (buildPatterns termList) "Le chat blanc"["chat blanc"]
83 extractTermsWithList' :: Patterns -> Text -> [Text]
84 extractTermsWithList' pats = map (concat . map concat . replaceTerms pats) . monoTextsBySentence