]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Terms.hs
[DB/FACT] Gargantext.Database.Prelude
[gargantext.git] / src / Gargantext / Text / Terms.hs
1 {-|
2 Module : Gargantext.Text.Ngrams
3 Description : Ngrams definition and tools
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 An @n-gram@ is a contiguous sequence of n items from a given sample of
11 text. In Gargantext application the items are words, n is a non negative
12 integer.
13
14 Using Latin numerical prefixes, an n-gram of size 1 is referred to as a
15 "unigram"; size 2 is a "bigram" (or, less commonly, a "digram"); size
16 3 is a "trigram". English cardinal numbers are sometimes used, e.g.,
17 "four-gram", "five-gram", and so on.
18
19 Source: https://en.wikipedia.org/wiki/Ngrams
20
21 TODO
22 group Ngrams -> Tree
23 compute occ by node of Tree
24 group occs according groups
25
26 compute cooccurrences
27 compute graph
28
29 -}
30
31 {-# LANGUAGE NoImplicitPrelude #-}
32 {-# LANGUAGE OverloadedStrings #-}
33 {-# LANGUAGE DeriveGeneric #-}
34 {-# LANGUAGE TemplateHaskell #-}
35 {-# LANGUAGE RankNTypes #-}
36 {-# LANGUAGE FlexibleContexts #-}
37 {-# LANGUAGE ConstrainedClassMethods #-}
38
39 module Gargantext.Text.Terms
40 where
41
42 import Control.Lens
43 import Data.Map (Map)
44 import qualified Data.Map as Map
45 import Data.Text (Text)
46 import Data.Traversable
47 import GHC.Base (String)
48 import GHC.Generics (Generic)
49 import Gargantext.Core
50 import Gargantext.Core.Types
51 import Gargantext.Core.Flow.Types
52 import Gargantext.Prelude
53 import Gargantext.Text (sentences, HasText(..))
54 import Gargantext.Text.Terms.Eleve (mainEleveWith, Tries, Token, buildTries, toToken)
55 import Gargantext.Database.Schema.Ngrams (Ngrams(..), NgramsType(..))
56 import Gargantext.Text.Terms.Mono (monoTerms)
57 import Gargantext.Database.Prelude (Cmd)
58 import Gargantext.Text.Terms.Mono.Stem (stem)
59 import Gargantext.Text.Terms.Mono.Token.En (tokenize)
60 import Gargantext.Text.Terms.Multi (multiterms)
61 import qualified Data.List as List
62 import qualified Data.Set as Set
63 import qualified Data.Text as Text
64
65 data TermType lang
66 = Mono { _tt_lang :: !lang }
67 | Multi { _tt_lang :: !lang }
68 | MonoMulti { _tt_lang :: !lang }
69 | Unsupervised { _tt_lang :: !lang
70 , _tt_windowSize :: !Int
71 , _tt_ngramsSize :: !Int
72 , _tt_model :: !(Maybe (Tries Token ()))
73 }
74 deriving Generic
75
76 makeLenses ''TermType
77 --group :: [Text] -> [Text]
78 --group = undefined
79
80 -- remove Stop Words
81 -- map (filter (\t -> not . elem t)) $
82 ------------------------------------------------------------------------
83 -- | Sugar to extract terms from text (hiddeng mapM from end user).
84 --extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
85 extractTerms :: TermType Lang -> [Text] -> IO [[Terms]]
86
87 extractTerms (Unsupervised l n s m) xs = mapM (terms (Unsupervised l n s (Just m'))) xs
88 where
89 m' = case m of
90 Just m''-> m''
91 Nothing -> newTries n (Text.intercalate " " xs)
92
93 extractTerms termTypeLang xs = mapM (terms termTypeLang) xs
94
95
96 ------------------------------------------------------------------------
97 withLang :: HasText a
98 => TermType Lang
99 -> [DocumentWithId a]
100 -> TermType Lang
101 withLang (Unsupervised l n s m) ns = Unsupervised l n s m'
102 where
103 m' = case m of
104 Nothing -> -- trace ("buildTries here" :: String)
105 Just $ buildTries n ( fmap toToken
106 $ uniText
107 $ Text.intercalate " . "
108 $ List.concat
109 $ map hasText ns
110 )
111 just_m -> just_m
112 withLang l _ = l
113 ------------------------------------------------------------------------
114
115
116 class ExtractNgramsT h
117 where
118 extractNgramsT :: HasText h
119 => TermType Lang
120 -> h
121 -> Cmd err (Map Ngrams (Map NgramsType Int))
122
123 filterNgramsT :: Int -> Map Ngrams (Map NgramsType Int)
124 -> Map Ngrams (Map NgramsType Int)
125 filterNgramsT s ms = Map.fromList $ map (\a -> filter' s a) $ Map.toList ms
126 where
127 filter' s' (ng@(Ngrams t n),y) = case (Text.length t) < s' of
128 True -> (ng,y)
129 False -> (Ngrams (Text.take s' t) n , y)
130
131
132 -- =======================================================
133
134 -- | Terms from Text
135 -- Mono : mono terms
136 -- Multi : multi terms
137 -- MonoMulti : mono and multi
138 -- TODO : multi terms should exclude mono (intersection is not empty yet)
139 terms :: TermType Lang -> Text -> IO [Terms]
140 terms (Mono lang) txt = pure $ monoTerms lang txt
141 terms (Multi lang) txt = multiterms lang txt
142 terms (MonoMulti lang) txt = terms (Multi lang) txt
143 terms (Unsupervised lang n s m) txt = termsUnsupervised (Unsupervised lang n s (Just m')) txt
144 where
145 m' = maybe (newTries n txt) identity m
146 -- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
147 ------------------------------------------------------------------------
148
149 text2term :: Lang -> [Text] -> Terms
150 text2term _ [] = Terms [] Set.empty
151 text2term lang txt = Terms txt (Set.fromList $ map (stem lang) txt)
152
153 isPunctuation :: Text -> Bool
154 isPunctuation x = List.elem x $ (Text.pack . pure)
155 <$> ("!?(),;." :: String)
156
157 -- | Unsupervised ngrams extraction
158 -- language agnostic extraction
159 -- TODO: remove IO
160 -- TODO: newtype BlockText
161
162 type WindowSize = Int
163 type MinNgramSize = Int
164
165 termsUnsupervised :: TermType Lang -> Text -> IO [Terms]
166 termsUnsupervised (Unsupervised l n s m) =
167 pure
168 . map (text2term l)
169 . List.nub
170 . (List.filter (\l' -> List.length l' >= s))
171 . List.concat
172 . mainEleveWith (maybe (panic "no model") identity m) n
173 . uniText
174 termsUnsupervised _ = undefined
175
176 newTries :: Int -> Text -> Tries Token ()
177 newTries n t = buildTries n (fmap toToken $ uniText t)
178
179 -- | TODO removing long terms > 24
180 uniText :: Text -> [[Text]]
181 uniText = map (List.filter (not . isPunctuation))
182 . map tokenize
183 . sentences -- | TODO get sentences according to lang
184 . Text.toLower
185