]> Git — Sourcephile - gargantext.git/blob - src/Gargantext/Text/List.hs
[PARSERS] RIS (WIP).
[gargantext.git] / src / Gargantext / Text / List.hs
1 {-|
2 Module : Gargantext.Text.Ngrams.Lists
3 Description : Tools to build lists
4 Copyright : (c) CNRS, 2017-Present
5 License : AGPL + CECILL v3
6 Maintainer : team@gargantext.org
7 Stability : experimental
8 Portability : POSIX
9
10 -}
11
12 {-# LANGUAGE NoImplicitPrelude #-}
13 {-# LANGUAGE OverloadedStrings #-}
14 {-# LANGUAGE RankNTypes #-}
15
16 module Gargantext.Text.List
17 where
18
19 import Data.Map (Map)
20 import Data.Set (Set)
21 import Data.Text (Text)
22 import Gargantext.API.Ngrams (NgramsElement, mkNgramsElement, RootParent(..), mSetFromList)
23 import Gargantext.Core (Lang(..))
24 import Gargantext.Core.Types (ListType(..), MasterCorpusId, UserCorpusId)
25 import Gargantext.Database.Metrics.NgramsByNode (getTficf', sortTficf, ngramsGroup, getNodesByNgramsUser, groupNodesByNgramsWith)
26 import Gargantext.Database.Schema.Ngrams (NgramsType(..))
27 import Gargantext.Database.Utils (Cmd)
28 import Gargantext.Text.List.Learn (Model(..))
29 import Gargantext.Prelude
30 --import Gargantext.Text.Terms (TermType(..))
31 import qualified Data.Char as Char
32 import qualified Data.List as List
33 import qualified Data.Map as Map
34 import qualified Data.Set as Set
35 import qualified Data.Text as Text
36
37
38 data NgramsListBuilder = BuilderStepO { stemSize :: Int
39 , stemX :: Int
40 , stopSize :: Int
41 }
42 | BuilderStep1 { withModel :: Model }
43 | BuilderStepN { withModel :: Model }
44
45
46
47 data StopSize = StopSize {unStopSize :: Int}
48
49 -- | TODO improve grouping functions of Authors, Sources, Institutes..
50 buildNgramsLists :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId
51 -> Cmd err (Map NgramsType [NgramsElement])
52 buildNgramsLists l n m s uCid mCid = do
53 ngTerms <- buildNgramsTermsList l n m s uCid mCid
54 othersTerms <- mapM (buildNgramsOthersList uCid identity) [Authors, Sources, Institutes]
55 pure $ Map.unions $ othersTerms <> [ngTerms]
56
57
58 buildNgramsOthersList :: UserCorpusId -> (Text -> Text) -> NgramsType
59 -> Cmd err (Map NgramsType [NgramsElement])
60 buildNgramsOthersList uCid groupIt nt = do
61 ngs <- groupNodesByNgramsWith groupIt <$> getNodesByNgramsUser uCid nt
62
63 pure $ Map.fromList [(nt, [ mkNgramsElement t CandidateTerm Nothing (mSetFromList [])
64 | (t,_ns) <- Map.toList ngs
65 ]
66 )
67 ]
68
69 -- TODO remove hard coded parameters
70 buildNgramsTermsList :: Lang -> Int -> Int -> StopSize -> UserCorpusId -> MasterCorpusId
71 -> Cmd err (Map NgramsType [NgramsElement])
72 buildNgramsTermsList l n m s uCid mCid = do
73 candidates <- sortTficf <$> getTficf' uCid mCid NgramsTerms (ngramsGroup l n m)
74 --printDebug "candidate" (length candidates)
75
76 let termList = toTermList ((isStopTerm s) . fst) candidates
77 --let termList = toTermList ((\_ -> False) . fst) candidates
78 --printDebug "termlist" (length termList)
79
80 let ngs = List.concat $ map toNgramsElement termList
81
82 pure $ Map.fromList [(NgramsTerms, ngs)]
83
84
85 toNgramsElement :: (ListType, (Text, (Double, Set Text))) -> [NgramsElement]
86 toNgramsElement (listType, (_stem, (_score, setNgrams))) =
87 case Set.toList setNgrams of
88 [] -> []
89 (parent:children) -> [parentElem] <> childrenElems
90 where
91 parentElem = mkNgramsElement parent
92 listType
93 Nothing
94 (mSetFromList children)
95 childrenElems = map (\t -> mkNgramsElement t listType
96 (Just $ RootParent parent parent)
97 (mSetFromList [])
98 ) children
99
100 -- TODO remove hard coded parameters
101 toTermList :: (a -> Bool) -> [a] -> [(ListType, a)]
102 toTermList stop ns = map (toTermList' stop CandidateTerm) xs
103 <> map (toTermList' stop GraphTerm) ys
104 <> map (toTermList' stop CandidateTerm) zs
105 where
106 toTermList' stop' l n = case stop' n of
107 True -> (StopTerm, n)
108 False -> (l, n)
109
110 -- TODO use % of size of list
111 -- TODO user ML
112 xs = take a ns
113 ys = take b $ drop a ns
114 zs = drop b $ drop a ns
115
116 a = 3
117 b = 500
118
119 isStopTerm :: StopSize -> Text -> Bool
120 isStopTerm (StopSize n) x = Text.length x < n || any isStopChar (Text.unpack x)
121 where
122 isStopChar c = not (c `elem` ("- /()%" :: [Char]) || Char.isAlpha c)