]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Core/Text/Terms/WithList.hs
[pubmed] some api_key pubmed work
[gargantext.git] / src / Gargantext / Core / Text / Terms / WithList.hs
1 {-|
2 Module : Gargantext.Core.Text.Terms.WithList
3 Description :
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
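Extract the terms of a given 'TermList' from a text. Each label and each of
its alternative forms is compiled into a Knuth-Morris-Pratt table (see
'buildPatterns') and searched for in the tokens of the text (see
'replaceTerms').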
12
13 -}
14 {-# LANGUAGE BangPatterns #-}
15
16 module Gargantext.Core.Text.Terms.WithList where
17
18 import Data.List (null)
19 import Data.Ord
20 import Data.Text (Text, concat, unwords)
21 import Gargantext.Prelude
22 import Gargantext.Core.Text.Context
23 import Gargantext.Core.Text.Terms.Mono (monoTextsBySentence)
24 import Prelude (error)
25 import qualified Data.Algorithms.KMP as KMP
26 import qualified Data.IntMap.Strict as IntMap
27 import qualified Data.List as List
28 ------------------------------------------------------------------------
29
-- | A term pattern compiled for Knuth-Morris-Pratt search over token lists.
data Pattern = Pattern
  { _pat_table  :: !(KMP.Table Text) -- ^ KMP table built from the pattern's tokens
  , _pat_length :: !Int              -- ^ number of tokens in the pattern
  , _pat_terms  :: ![Text]           -- ^ label reported when the pattern matches
  }

-- | A collection of compiled patterns.
type Patterns = [Pattern]
36
37 ------------------------------------------------------------------------
-- | Scan a list of tokens and return, in order of appearance, the label
-- ('_pat_terms') of every pattern found in it.
--
-- All start positions reported by 'KMP.match' are indexed up front; when
-- several patterns start on the same token the longest one is kept (on equal
-- length, the one inserted into the index first). The scan then walks the
-- tokens left to right: positions where no pattern starts are skipped, and
-- after a hit the cursor jumps past the matched span, so reported matches
-- never overlap.
replaceTerms :: Patterns -> [Text] -> [[Text]]
replaceTerms pats terms = go 0
  where
    terms_len = length terms

    go ix
      | ix >= terms_len = []
      | otherwise =
          case IntMap.lookup ix m of
            Nothing          -> go (ix + 1)
            -- Always move forward by at least one token: 'Pattern' is an
            -- exported constructor, and a recorded length <= 0 would otherwise
            -- make this loop revisit the same index forever.
            Just (len, term) -> term : go (ix + max 1 len)

    -- 'IntMap.fromListWith' calls this as @merge new old@; the entry already
    -- stored is replaced only by a strictly longer match.
    merge (len1, lab1) (len2, lab2) =
      if len2 < len1 then (len1, lab1) else (len2, lab2)

    -- Start index -> (pattern length, label) of the best match starting there.
    m =
      IntMap.fromListWith merge
        [ (ix, (len, term))
        | Pattern pat len term <- pats
        , ix <- KMP.match pat terms
        ]
58
-- | Compile a 'TermList' into search patterns, ordered by
-- @Down . _pat_length@ (longest pattern first).
--
-- Each entry contributes one 'Pattern' for its label and one per alternative
-- form; all of them carry the entry's label as '_pat_terms'. Empty tokens
-- are dropped before the KMP table is built.
--
-- Calls 'error' with @"buildPatterns: ERR2"@ when the label or an alternative
-- has no non-empty token left.
buildPatterns :: TermList -> Patterns
buildPatterns = sortWith (Down . _pat_length) . concatMap buildPattern
  where
    buildPattern (label, alts) = map (toPattern . filter (/= "")) (label : alts)
      where
        -- The tokens reaching this point have just been filtered, so the
        -- former check for an embedded "" (ERR1) could never fire and has
        -- been removed.
        toPattern alt
          | null alt  = error "buildPatterns: ERR2"
          | otherwise = Pattern (KMP.build alt) (length alt) label
        --(Terms label $ Set.empty) -- TODO check stems
69
70
71 --------------------------------------------------------------------------
72 -- Utils
-- | Text form of a matched term: the tokens of its label joined by 'unwords'.
type MatchedText = Text

-- | List every distinct term of the pattern set found in a text.
--
-- The labels returned by 'extractTermsWithList' are flattened, each label is
-- joined with 'unwords', and duplicates are dropped while keeping the order
-- of first occurrence.
--
-- NOTE(review): 'List.nub' is quadratic in the number of matches.
termsInText :: Patterns -> Text -> [MatchedText]
termsInText pats txt =
  List.nub
    [ unwords label
    | sentence <- extractTermsWithList pats txt
    , label    <- sentence
    ]
79
80 --------------------------------------------------------------------------
81
-- | Run 'replaceTerms' on every element returned by 'monoTextsBySentence',
-- yielding one list of matched labels per element.
extractTermsWithList :: Patterns -> Text -> Corpus [Text]
extractTermsWithList pats txt =
  [ replaceTerms pats sentence | sentence <- monoTextsBySentence txt ]
84
-- | Like 'extractTermsWithList', but collapse the matches of each element
-- returned by 'monoTextsBySentence' into a single 'Text': the tokens of every
-- matched label are concatenated, then all labels are concatenated, without
-- any separator.
--
-- >>> let termList = [(["chat blanc"], [["chat","blanc"]])] :: TermList
-- >>> extractTermsWithList' (buildPatterns termList) "Le chat blanc"
-- ["chat blanc"]
extractTermsWithList' :: Patterns -> Text -> [Text]
extractTermsWithList' pats txt =
  [ glue (replaceTerms pats sentence) | sentence <- monoTextsBySentence txt ]
  where
    -- Both uses of 'concat' are the one imported from "Data.Text".
    glue labels = concat (map concat labels)
92
93 --------------------------------------------------------------------------
94
95 {- | Not used
96 filterWith :: TermList
97 -> (a -> Text)
98 -> [a]
99 -> [(a, [Text])]
100 filterWith termList f xs = filterWith' termList f zip xs
101
102
103 filterWith' :: TermList
104 -> (a -> Text)
105 -> ([a] -> [[Text]] -> [b])
106 -> [a]
107 -> [b]
108 filterWith' termList f f' xs = f' xs
109 $ map (extractTermsWithList' pats)
110 $ map f xs
111 where
112 pats = buildPatterns termList
113 -}