]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/List.hs
Merge branch 'dev' into dev-phylo
[gargantext.git] / src / Gargantext / Text / List.hs
1 {-|
2 Module : Gargantext.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE NoImplicitPrelude #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE RankNTypes #-}
15
16 module Gargantext.Text.List
17 where
18
19 import Data.Either (partitionEithers, Either(..))
20 import Debug.Trace (trace)
21 import Data.Map (Map)
22 import Data.Set (Set)
23 import Data.Text (Text)
24 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mSetFromList)
25 import Gargantext.API.Ngrams.Tools (getCoocByNgrams', Diagonal(..))
26 import Gargantext.Core (Lang(..))
27 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId, NodeId)
28 import Gargantext.Database.Metrics.NgramsByNode (getTficf', sortTficf, ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith)
29 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
30 import Gargantext.Database.Utils (Cmd)
31 import Gargantext.Text.List.Learn (Model(..))
32 import Gargantext.Text.Metrics (takeScored)
33 import Gargantext.Prelude
34 --import Gargantext.Text.Terms (TermType(..))
35 import qualified Data.Char as Char
36 import qualified Data.List as List
37 import qualified Data.Map as Map
38 import qualified Data.Set as Set
39 import qualified Data.Text as Text
40
41
42 data NgramsListBuilder = BuilderStepO { stemSize :: Int
43 , stemX :: Int
44 , stopSize :: Int
45 }
46 | BuilderStep1 { withModel :: Model }
47 | BuilderStepN { withModel :: Model }
48 | Tficf { nlb_lang :: Lang
49 , nlb_group1 :: Int
50 , nlb_group2 :: Int
51 , nlb_stopSize :: StopSize
52 , nlb_userCorpusId :: UserCorpusId
53 , nlb_masterCorpusId :: MasterCorpusId
54 }
55
56
57 data StopSize = StopSize {unStopSize :: Int}
58
59 -- | TODO improve grouping functions of Authors, Sources, Institutes..
60 buildNgramsLists :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId
61 -> Cmd err (Map NgramsType [NgramsElement])
62 buildNgramsLists l n m s uCid mCid = do
63 ngTerms <- buildNgramsTermsList l n m s uCid mCid
64 --ngTerms <- buildNgramsTermsList' uCid (ngramsGroup l n m) (isStopTerm s . fst) 500 50
65 othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
66 pure $ Map.unions $ othersTerms <> [ngTerms]
67
68
69 buildNgramsOthersList :: UserCorpusId -> (Text -> Text) -> NgramsType
70 -> Cmd err (Map NgramsType [NgramsElement])
71 buildNgramsOthersList uCid groupIt nt = do
72 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
73
74 let
75 all' = Map.toList ngs
76 pure $ (toElements GraphTerm all') <> (toElements CandidateTerm all')
77 --pure $ (toElements GraphTerm $ take 10 all') <> (toElements CandidateTerm $ drop 10 all')
78 where
79 toElements nType x = Map.fromList [(nt, [ mkNgramsElement t nType Nothing (mSetFromList [])
80 | (t,_ns) <- x
81 ]
82 )
83 ]
84
85 --{-
86 buildNgramsTermsList' :: UserCorpusId
87 -> (Text -> Text)
88 -> ((Text, (Set Text, Set NodeId)) -> Bool) -> Int -> Int
89 -> Cmd err (Map NgramsType [NgramsElement])
90 --}
91 buildNgramsTermsList' uCid groupIt stop gls is = do
92 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid NgramsTerms
93
94 let
95 (stops, candidates) = partitionEithers
96 $ map (\t -> if stop t then Left t else Right t)
97 $ Map.toList
98 $ Map.filter ((\s' -> Set.size s' > 1) . snd) ngs
99
100 (maps, candidates') = takeScored gls is
101 $ getCoocByNgrams' snd (Diagonal True)
102 $ Map.fromList candidates
103
104
105 toList' t = (fst t, (fromIntegral $ Set.size $ snd $ snd t, fst $ snd t))
106
107 (s,c,m) = (stops
108 , List.filter (\(k,_) -> List.elem k candidates') candidates
109 , List.filter (\(k,_) -> List.elem k maps) candidates
110 )
111
112 let ngs' = List.concat
113 $ map toNgramsElement
114 $ map (\t -> (StopTerm, toList' t)) s
115 <> map (\t -> (CandidateTerm, toList' t)) c
116 <> map (\t -> (GraphTerm, toList' t)) m
117
118 pure $ Map.fromList [(NgramsTerms, ngs')]
119
120
121 buildNgramsTermsList :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId
122 -> Cmd err (Map NgramsType [NgramsElement])
123 buildNgramsTermsList l n m s uCid mCid = do
124 candidates <- sortTficf <$> getTficf' uCid mCid NgramsTerms (ngramsGroup l n m)
125 let
126 candidatesSize = 2000
127 a = 10
128 b = 10
129 candidatesHead = List.take candidatesSize candidates
130 candidatesTail = List.drop candidatesSize candidates
131 termList = (toTermList a b ((isStopTerm s) . fst) candidatesHead)
132 <> (map (toList ((isStopTerm s) .fst) CandidateTerm) candidatesTail)
133 let ngs = List.concat $ map toNgramsElement termList
134
135 pure $ Map.fromList [(NgramsTerms, ngs)]
136
137
138 toNgramsElement :: (ListType, (Text, (Double, Set Text))) -> [NgramsElement]
139 toNgramsElement (listType, (_stem, (_score, setNgrams))) =
140 case Set.toList setNgrams of
141 [] -> []
142 (parent:children) -> [parentElem] <> childrenElems
143 where
144 parentElem = mkNgramsElement parent
145 listType
146 Nothing
147 (mSetFromList children)
148 childrenElems = map (\t -> mkNgramsElement t listType
149 (Just $ RootParent parent parent)
150 (mSetFromList [])
151 ) children
152
153
154 toList :: (b -> Bool) -> ListType -> b -> (ListType, b)
155 toList stop l n = case stop n of
156 True -> (StopTerm, n)
157 False -> (l, n)
158
159
160 toTermList :: Int -> Int -> (a -> Bool) -> [a] -> [(ListType, a)]
161 toTermList _ _ _ [] = []
162 toTermList a b stop ns = trace ("computing toTermList") $
163 map (toList stop CandidateTerm) xs
164 <> map (toList stop GraphTerm) ys
165 <> toTermList a b stop zs
166 where
167 xs = take a ns
168 ta = drop a ns
169
170 ys = take b ta
171 zs = drop b ta
172
173
174 isStopTerm :: StopSize -> Text -> Bool
175 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
176 where
177 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)