]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/Terms.hs
[API|Query] WIP need to fit query with frontend
[gargantext.git] / src / Gargantext / Text / Terms.hs
1 {-|
2 Module : Gargantext.Text.Ngrams
3 Description : Ngrams definition and tools
4 Copyright : (c) CNRS, 2017 - present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 An @n-gram@ is a contiguous sequence of n items from a given sample of
11 text. In Gargantext application the items are words, n is a non negative
12 integer.
13
14 Using Latin numerical prefixes, an n-gram of size 1 is referred to as a
15 "unigram"; size 2 is a "bigram" (or, less commonly, a "digram"); size
16 3 is a "trigram". English cardinal numbers are sometimes used, e.g.,
17 "four-gram", "five-gram", and so on.
18
19 Source: https://en.wikipedia.org/wiki/Ngrams
20
21 TODO
22 group Ngrams -> Tree
23 compute occ by node of Tree
24 group occs according groups
25
26 compute cooccurrences
27 compute graph
28
29 -}
30
31 {-# LANGUAGE NoImplicitPrelude #-}
32 {-# LANGUAGE OverloadedStrings #-}
33 {-# LANGUAGE DeriveGeneric #-}
34 {-# LANGUAGE TemplateHaskell #-}
35
36 module Gargantext.Text.Terms
37 where
38
39 import Control.Lens
40 import Data.Aeson.TH (deriveJSON)
41 import Data.Swagger
42 import Data.Text (Text)
43 import Data.Traversable
44 import GHC.Base (String)
45 import GHC.Generics (Generic)
46 import Gargantext.Core.Utils.Prefix (unPrefix, unPrefixSwagger)
47 import Gargantext.Core
48 import Gargantext.Core.Types
49 import Gargantext.Prelude
50 import Gargantext.Text (sentences)
51 import Gargantext.Text.Terms.Eleve (mainEleveWith, Tries, Token, buildTries, toToken)
52 import Gargantext.Text.Terms.Mono (monoTerms)
53 import Gargantext.Text.Terms.Mono.Stem (stem)
54 import Gargantext.Text.Terms.Mono.Token.En (tokenize)
55 import Gargantext.Text.Terms.Multi (multiterms)
56 import qualified Data.List as List
57 import qualified Data.Set as Set
58 import qualified Data.Text as Text
59
60 data TermType lang
61 = Mono { _tt_lang :: lang }
62 | Multi { _tt_lang :: lang }
63 | MonoMulti { _tt_lang :: lang }
64 | Unsupervised { _tt_lang :: lang
65 , _tt_windowSize :: Int
66 , _tt_ngramsSize :: Int
67 , _tt_model :: Maybe (Tries Token ())
68 }
69 deriving Generic
70
71 makeLenses ''TermType
72 --group :: [Text] -> [Text]
73 --group = undefined
74
75 -- remove Stop Words
76 -- map (filter (\t -> not . elem t)) $
77 ------------------------------------------------------------------------
78 -- | Sugar to extract terms from text (hiddeng mapM from end user).
79 --extractTerms :: Traversable t => TermType Lang -> t Text -> IO (t [Terms])
80 extractTerms :: TermType Lang -> [Text] -> IO [[Terms]]
81
82 extractTerms (Unsupervised l n s m) xs = mapM (terms (Unsupervised l n s (Just m'))) xs
83 where
84 m' = case m of
85 Just m''-> m''
86 Nothing -> newTries n (Text.intercalate " " xs)
87
88 extractTerms termTypeLang xs = mapM (terms termTypeLang) xs
89
90 ------------------------------------------------------------------------
91 -- | Terms from Text
92 -- Mono : mono terms
93 -- Multi : multi terms
94 -- MonoMulti : mono and multi
95 -- TODO : multi terms should exclude mono (intersection is not empty yet)
96 terms :: TermType Lang -> Text -> IO [Terms]
97 terms (Mono lang) txt = pure $ monoTerms lang txt
98 terms (Multi lang) txt = multiterms lang txt
99 terms (MonoMulti lang) txt = terms (Multi lang) txt
100 terms (Unsupervised lang n s m) txt = termsUnsupervised (Unsupervised lang n s (Just m')) txt
101 where
102 m' = maybe (newTries n txt) identity m
103 -- terms (WithList list) txt = pure . concat $ extractTermsWithList list txt
104 ------------------------------------------------------------------------
105
106 text2term :: Lang -> [Text] -> Terms
107 text2term _ [] = Terms [] Set.empty
108 text2term lang txt = Terms txt (Set.fromList $ map (stem lang) txt)
109
110 isPunctuation :: Text -> Bool
111 isPunctuation x = List.elem x $ (Text.pack . pure)
112 <$> ("!?(),;." :: String)
113
114 -- | Unsupervised ngrams extraction
115 -- language agnostic extraction
116 -- TODO: remove IO
117 -- TODO: newtype BlockText
118
119 type WindowSize = Int
120 type MinNgramSize = Int
121
122 termsUnsupervised :: TermType Lang -> Text -> IO [Terms]
123 termsUnsupervised (Unsupervised l n s m) =
124 pure
125 . map (text2term l)
126 . List.nub
127 . (List.filter (\l' -> List.length l' >= s))
128 . List.concat
129 . mainEleveWith (maybe (panic "no model") identity m) n
130 . uniText
131 termsUnsupervised _ = undefined
132
133 newTries :: Int -> Text -> Tries Token ()
134 newTries n t = buildTries n (fmap toToken $ uniText t)
135
136 -- | TODO removing long terms > 24
137 uniText :: Text -> [[Text]]
138 uniText = map (List.filter (not . isPunctuation))
139 . map tokenize
140 . sentences -- | TODO get sentences according to lang
141 . Text.toLower
142