]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Terms/WithList.hs
Merge branch 'dev' into 145-dev-graph-explorer-search-tfidf
[gargantext.git] / src / Gargantext / Core / Text / Terms / WithList.hs
1 {-|
2 Module : Gargantext.Core.Text.Terms.WithList
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 Here is a longer description of this module, containing some
11 commentary with @some markup@.
12
13 -}
14 {-# LANGUAGE BangPatterns #-}
15
16 module Gargantext.Core.Text.Terms.WithList where
17
18 import Data.List (null)
19 import Data.Ord
20 import Data.Text (Text, concat, unwords)
21 import Gargantext.Prelude
22 import Gargantext.Core.Text.Context
23 import Gargantext.Core.Text.Terms.Mono (monoTextsBySentence)
24 import Gargantext.Core.Types (TermsCount)
25 import Gargantext.Core.Utils (groupWithCounts)
26 import Prelude (error)
27 import qualified Data.Algorithms.KMP as KMP
28 import qualified Data.IntMap.Strict as IntMap
29 import qualified Data.List as List
30 ------------------------------------------------------------------------
31
32 data Pattern = Pattern
33 { _pat_table :: !(KMP.Table Text)
34 , _pat_length :: !Int
35 , _pat_terms :: ![Text]
36 }
37 type Patterns = [Pattern]
38
39 ------------------------------------------------------------------------
40 replaceTerms :: Patterns -> [Text] -> [[Text]]
41 replaceTerms pats terms = go 0
42 where
43 terms_len = length terms
44
45 go ix | ix >= terms_len = []
46 | otherwise =
47 case IntMap.lookup ix m of
48 Nothing -> go (ix + 1)
49 Just (len, term) ->
50 term : go (ix + len)
51
52
53 merge (len1, lab1) (len2, lab2) =
54 if len2 < len1 then (len1, lab1) else (len2, lab2)
55
56 m =
57 IntMap.fromListWith merge
58 [ (ix, (len, term))
59 | Pattern pat len term <- pats, ix <- KMP.match pat terms ]
60
61 buildPatterns :: TermList -> Patterns
62 buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
63 where
64 buildPattern (label, alts) = map f $ map (\alt -> filter (/= "") alt) (label : alts)
65 where
66 f alt | "" `elem` alt = error ("buildPatterns: ERR1" <> show(label))
67 | null alt = error "buildPatterns: ERR2"
68 | otherwise =
69 Pattern (KMP.build alt) (length alt) label
70 --(Terms label $ Set.empty) -- TODO check stems
71
72
73 --------------------------------------------------------------------------
74 -- Utils
75 type MatchedText = Text
76 termsInText :: Patterns -> Text -> [(MatchedText, TermsCount)]
77 termsInText pats txt = groupWithCounts
78 $ List.concat
79 $ map (map unwords)
80 $ extractTermsWithList pats txt
81
82 --------------------------------------------------------------------------
83
84 extractTermsWithList :: Patterns -> Text -> Corpus [Text]
85 extractTermsWithList pats = map (replaceTerms pats) . monoTextsBySentence
86
87 -- | Extract terms
88 -- >>> let termList = [(["chat blanc"], [["chat","blanc"]])] :: TermList
89 -- extractTermsWithList' (buildPatterns termList) "Le chat blanc"["chat blanc"]
90 -- ["chat blanc"]
91 extractTermsWithList' :: Patterns -> Text -> [Text]
92 extractTermsWithList' pats = map (concat . map concat . replaceTerms pats)
93 . monoTextsBySentence
94
95 --------------------------------------------------------------------------
96
97 {- | Not used
98 filterWith :: TermList
99 -> (a -> Text)
100 -> [a]
101 -> [(a, [Text])]
102 filterWith termList f xs = filterWith' termList f zip xs
103
104
105 filterWith' :: TermList
106 -> (a -> Text)
107 -> ([a] -> [[Text]] -> [b])
108 -> [a]
109 -> [b]
110 filterWith' termList f f' xs = f' xs
111 $ map (extractTermsWithList' pats)
112 $ map f xs
113 where
114 pats = buildPatterns termList
115 -}