]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Terms/WithList.hs
[BIN/PHYLO] first draft, needs phylo connection.
[gargantext.git] / src / Gargantext / Text / Terms / WithList.hs
1 {-|
2 Module : Gargantext.Text.Terms.WithList
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Here is a longer description of this module, containing some
11 commentary with @some markup@.
12
13 -}
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE BangPatterns #-}
17
18 module Gargantext.Text.Terms.WithList where
19
20 import qualified Data.Algorithms.KMP as KMP
21 import Data.Text (Text, concat, unpack)
22 import qualified Data.IntMap.Strict as IntMap
23
24 import Gargantext.Text.Context
25 import Gargantext.Text.Terms.Mono (monoTextsBySentence)
26
27 import Prelude (error)
28 import Gargantext.Prelude
29 import Data.List (null, concatMap)
30 import Data.Ord
31
32
33 ------------------------------------------------------------------------
34
35 data Pattern = Pattern
36 { _pat_table :: !(KMP.Table Text)
37 , _pat_length :: !Int
38 , _pat_terms :: ![Text]
39 }
40 type Patterns = [Pattern]
41
42 ------------------------------------------------------------------------
43
44 replaceTerms :: Patterns -> [Text] -> [[Text]]
45 replaceTerms pats terms = go 0
46 where
47 terms_len = length terms
48
49 go ix | ix >= terms_len = []
50 | otherwise =
51 case IntMap.lookup ix m of
52 Nothing -> go (ix + 1)
53 Just (len, term) ->
54 term : go (ix + len)
55
56
57 merge (len1, lab1) (len2, lab2) =
58 if len2 < len1 then (len1, lab1) else (len2, lab2)
59
60 m =
61 IntMap.fromListWith merge
62 [ (ix, (len, term))
63 | Pattern pat len term <- pats, ix <- KMP.match pat terms ]
64
65 buildPatterns :: TermList -> Patterns
66 buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
67 where
68 buildPattern (label, alts) = map f (label : alts)
69 where
70 f alt | "" `elem` alt = error "buildPatterns: ERR1"
71 | null alt = error "buildPatterns: ERR2"
72 | otherwise =
73 Pattern (KMP.build alt) (length alt) label
74 --(Terms label $ Set.empty) -- TODO check stems
75
76 extractTermsWithList :: Patterns -> Text -> Corpus [Text]
77 extractTermsWithList pats = map (replaceTerms pats) . monoTextsBySentence
78
79 -- | Extract terms
80 -- >>> let termList = [(["chat blanc"], [["chat","blanc"]])] :: TermList
81 -- extractTermsWithList' (buildPatterns termList) "Le chat blanc"["chat blanc"]
82 -- ["chat blanc"]
83 extractTermsWithList' :: Patterns -> Text -> [Text]
84 extractTermsWithList' pats = map (concat . map concat . replaceTerms pats) . monoTextsBySentence
85
86