]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Terms/WithList.hs
[docker] update image, add README info
[gargantext.git] / src / Gargantext / Text / Terms / WithList.hs
1 {-|
2 Module : Gargantext.Text.Terms.WithList
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Here is a longer description of this module, containing some
11 commentary with @some markup@.
12
13 -}
14 {-# LANGUAGE NoImplicitPrelude #-}
15 {-# LANGUAGE OverloadedStrings #-}
16 {-# LANGUAGE BangPatterns #-}
17
18 module Gargantext.Text.Terms.WithList where
19
20 import Data.List (null, concatMap)
21 import Data.Ord
22 import Data.Text (Text, concat)
23 import Gargantext.Prelude
24 import Gargantext.Text.Context
25 import Gargantext.Text.Terms.Mono (monoTextsBySentence)
26 import Prelude (error)
27 import qualified Data.Algorithms.KMP as KMP
28 import qualified Data.IntMap.Strict as IntMap
29
30 ------------------------------------------------------------------------
31
32 data Pattern = Pattern
33 { _pat_table :: !(KMP.Table Text)
34 , _pat_length :: !Int
35 , _pat_terms :: ![Text]
36 }
37 type Patterns = [Pattern]
38
39 ------------------------------------------------------------------------
40 replaceTerms :: Patterns -> [Text] -> [[Text]]
41 replaceTerms pats terms = go 0
42 where
43 terms_len = length terms
44
45 go ix | ix >= terms_len = []
46 | otherwise =
47 case IntMap.lookup ix m of
48 Nothing -> go (ix + 1)
49 Just (len, term) ->
50 term : go (ix + len)
51
52
53 merge (len1, lab1) (len2, lab2) =
54 if len2 < len1 then (len1, lab1) else (len2, lab2)
55
56 m =
57 IntMap.fromListWith merge
58 [ (ix, (len, term))
59 | Pattern pat len term <- pats, ix <- KMP.match pat terms ]
60
61 buildPatterns :: TermList -> Patterns
62 buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
63 where
64 buildPattern (label, alts) = map f (label : alts)
65 where
66 f alt | "" `elem` alt = error "buildPatterns: ERR1"
67 | null alt = error "buildPatterns: ERR2"
68 | otherwise =
69 Pattern (KMP.build alt) (length alt) label
70 --(Terms label $ Set.empty) -- TODO check stems
71
72 extractTermsWithList :: Patterns -> Text -> Corpus [Text]
73 extractTermsWithList pats = map (replaceTerms pats) . monoTextsBySentence
74
75 -- | Extract terms
76 -- >>> let termList = [(["chat blanc"], [["chat","blanc"]])] :: TermList
77 -- extractTermsWithList' (buildPatterns termList) "Le chat blanc"["chat blanc"]
78 -- ["chat blanc"]
79 extractTermsWithList' :: Patterns -> Text -> [Text]
80 extractTermsWithList' pats = map (concat . map concat . replaceTerms pats)
81 . monoTextsBySentence
82
83
84 filterWith :: TermList
85 -> (a -> Text)
86 -> [a]
87 -> [(a, [Text])]
88 filterWith termList f xs = filterWith' termList f zip xs
89
90
91 filterWith' :: TermList
92 -> (a -> Text)
93 -> ([a] -> [[Text]] -> [b])
94 -> [a]
95 -> [b]
96 filterWith' termList f f' xs = f' xs
97 $ map (extractTermsWithList' pats)
98 $ map f xs
99 where
100 pats = buildPatterns termList
101