2 Module : Gargantext.Text.Terms.WithList
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
10 Here is a longer description of this module, containing some
11 commentary with @some markup@.
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE BangPatterns #-}
17 module Gargantext.Text.Terms.WithList where
19 import qualified Data.Algorithms.KMP as KMP
20 import Data.Char (isSpace)
21 import qualified Data.Text as T
22 import Data.Text (Text)
23 import qualified Data.IntMap.Strict as IntMap
24 import Gargantext.Prelude
25 import Data.List (concatMap)
31 type Pattern = KMP.Table Term
33 type TermList = [(Label, [[Term]])]
35 type Patterns = [(Pattern, Int, Label)]
37 isMultiTermSep :: Char -> Bool
38 isMultiTermSep = (`elem` ",.:;?!(){}[]")
40 type Sentence a = [a] -- or a nominal group
41 type Corpus a = [Sentence a] -- a list of sentences
43 replaceTerms :: Patterns -> Sentence Term -> Sentence Label
44 replaceTerms pats terms = go 0 terms
48 case IntMap.lookup ix m of
49 Nothing -> t : go (ix + 1) ts
51 label : go (ix + len) (drop (len - 1) ts)
53 -- TODO is it what we want?
54 merge (len1, lab1) (len2, lab2) =
55 if len1 > len2 then (len1, lab1) else (len2, lab2)
58 IntMap.fromListWith merge
60 | (pat, len, label) <- pats, ix <- KMP.match pat terms ]
62 buildPatterns :: TermList -> Patterns
63 buildPatterns = concatMap buildPattern
65 buildPattern (label, alts) = map f alts
67 f alt = (KMP.build alt, length alt, label)
69 -- monoterms'' :: Lang -> Text -> [Terms]
70 -- monoterms'' l txt = map (text2terms l) $ monoterms txt
72 extractTermsWithList :: Patterns -> Text -> Corpus Label
73 extractTermsWithList pats =
74 map (replaceTerms pats) .
75 map (T.split isSpace) . -- text2terms
76 T.split isMultiTermSep . T.toLower -- as in monoterms with a different list of seps