Skip to content

Commit a364386

Browse files
committed
add documentation to the major internal implementation of groupby methods
1 parent 82518c0 commit a364386

1 file changed

Lines changed: 116 additions & 5 deletions

File tree

src/danfojs-base/aggregators/groupby.ts

Lines changed: 116 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,50 @@ export default class Groupby {
3939
this.colIndex = colIndex
4040

4141
}
42-
42+
/**
43+
* Generate group object data needed for group operations
44+
* let data = [ [ 1, 2, 3 ], [ 4, 5, 6 ], [ 20, 30, 40 ], [ 39, 89, 78 ] ];
45+
* let cols = [ "A", "B", "C" ];
46+
* let df = new dfd.DataFrame(data, { columns: cols });
47+
* let groupDf = df.groupby([ "A" ]);
48+
* The following internal object is generated and saved to this.colDict
49+
* {
50+
* '1': { A: [ 1 ], B: [ 2 ], C: [ 3 ] },
51+
* '4': { A: [ 4 ], B: [ 5 ], C: [ 6 ] },
52+
* '20': { A: [ 20 ], B: [ 30 ], C: [ 40 ] },
53+
* '39': { A: [ 39 ], B: [ 89 ], C: [ 78 ] }
54+
* }
55+
* A groupby using more than one column is indexed by joining the values with '-'
56+
* e.g for df.groupby(['A','B'])
57+
* the result will look like this
58+
* {
59+
* '1-2': {A: [ 1 ], B: [ 2 ], C: [ 3 ]},
60+
* '4-5': {A: [ 4 ], B: [ 5 ], C: [ 6 ]}
61+
* }
62+
* but in doing analysis on a specific column like this
63+
* df.groupby(['A','B']).col(['C'])
64+
* will have the following set of internal result
65+
* {
66+
* '1-2': { C: [ 3 ]},
67+
* '4-5': {C: [ 6 ]}
68+
* }
69+
* In building our multi-index type of DataFrame for this data,
70+
* we have lost track of the values of columns A and B.
71+
* This could actually be generated by using split('-') on the object keys
72+
* e.g '1-2'.split('-') will give us the value for A and B.
73+
* But we might have an edge case where the A and B values themselves contain '-'
74+
* e.g
75+
* {
76+
* '1--2-': { C: [ 3 ]},
77+
* '4--5-': {C: [ 6 ]}
78+
* }
79+
* using `.split('-')` might not work well
80+
* Hence we create a key-value `keyToValue` object to store index and their
81+
* associated value
82+
* NOTE: In the previous implementation we made use of Graph representation
83+
* for the group by data and Depth First search (DFS). But we decided to use key-value
84+
* object in JavaScript as a hashmap to reduce search time compared to using Graph and DFS
85+
*/
4386
group(): Groupby{
4487
const self = this
4588
let keyToValue:{
@@ -78,6 +121,24 @@ export default class Groupby {
78121
return this
79122
}
80123

124+
/**
125+
* Generate new internal groupby data
126+
* group = df.groupby(['A', 'B']).col('C')
127+
* This filters the colDict property as generated by `.group()`
128+
* it filters each group to contain only column `C` in its internal object
129+
* e.g
130+
* {
131+
* '1-2': {A: [ 1 ], B: [ 2 ], C: [ 3 ]},
132+
* '4-5': {A: [ 4 ], B: [ 5 ], C: [ 6 ]}
133+
* }
134+
* to
135+
* {
136+
* '1-2': { C: [ 3 ]},
137+
* '4-5': {C: [ 6 ]}
138+
* }
139+
* @param colNames column names
140+
* @return Groupby
141+
*/
81142
col(colNames: ArrayType1D | undefined): Groupby {
82143

83144
if (typeof colNames === "undefined") {
@@ -114,7 +175,47 @@ export default class Groupby {
114175
return gp
115176
}
116177

117-
arithemetic(operation: {[key: string] : Array<string> | string} | string): { [key: string ]: {} } {
178+
/**
179+
* Perform all groupby arithmetic operations
180+
* In the previous implementation all groups data are
181+
* stored as DataFrames, which involves a lot of memory usage
182+
* Hence each group is now just a plain JavaScript object
183+
* and all arithmetic operations are done directly on JavaScript
184+
* arrays.
185+
* e.g
186+
* using this internal data
187+
* {
188+
* '1-2': {A: [ 1,3 ], B: [ 2,5 ], C: [ 3, 5 ]},
189+
* '4-5': {A: [ 4,1 ], B: [ 5,0 ], C: [ 6, 12 ]}
190+
* }
191+
* 1) using groupby(['A', 'B']).arithemetic("mean")
192+
* result: {
193+
* '1-2': {A_mean: [ 2 ], B_mean: [ 3.5 ], C_mean: [ 4 ]},
194+
* '4-5': {A_mean: [ 2.5 ], B_mean: [ 2.5 ], C_mean: [ 9 ]}
195+
* }
196+
* 2) .arithemetic({
197+
* A: 'mean',
198+
* B: 'sum',
199+
* C: 'min'
200+
* })
201+
* result:
202+
* {
203+
* '1-2': {A_mean: [ 2 ], B_sum: [ 7 ], C_min: [ 3 ]},
204+
* '4-5': {A_mean: [ 2.5 ], B_sum: [ 5 ], C_min: [ 6 ]}
205+
* }
206+
* 3) .arithemetic({
207+
* A: 'mean',
208+
* B: 'sum',
209+
* C: ['min', 'max']
210+
* })
211+
* result:
212+
* {
213+
* '1-2': {A_mean: [ 2 ], B_sum: [ 7 ], C_min: [ 3 ], C_max: [5]},
214+
* '4-5': {A_mean: [ 2.5 ], B_sum: [ 5 ], C_min: [ 6 ], C_max: [12]}
215+
* }
216+
* @param operation
217+
*/
218+
private arithemetic(operation: {[key: string] : Array<string> | string} | string): { [key: string ]: {} } {
118219

119220
const opsName = [ "mean", "sum", "count", "mode", "std", "var", "cumsum", "cumprod",
120221
"cummax", "cummin", "median" , "min"];
@@ -173,7 +274,12 @@ export default class Groupby {
173274
return colDict
174275
}
175276

176-
groupMathLog(colVal: Array<number>, ops: string): Array<number>{
277+
/**
278+
* Perform all arithmetic logic
279+
* @param colVal
280+
* @param ops
281+
*/
282+
private groupMathLog(colVal: Array<number>, ops: string): Array<number>{
177283
let data = []
178284
switch(ops) {
179285
case "max":
@@ -261,7 +367,12 @@ export default class Groupby {
261367
return data
262368
}
263369

264-
toDataFrame(colDict: { [key: string ]: {} }): DataFrame {
370+
/**
371+
* Takes in the internal groupby data and converts
372+
* them to a single data frame.
373+
* @param colDict
374+
*/
375+
private toDataFrame(colDict: { [key: string ]: {} }): DataFrame {
265376
let data: { [key: string ]: ArrayType1D } = {}
266377

267378
for(let key of this.colKeyDict(colDict)) {
@@ -287,7 +398,7 @@ export default class Groupby {
287398
return new DataFrame(data)
288399
}
289400

290-
operations(ops: string): DataFrame {
401+
private operations(ops: string): DataFrame {
291402
if (!this.groupColNames) {
292403
let colGroup = this.col(undefined)
293404
let colDict = colGroup.arithemetic(ops)

0 commit comments

Comments
 (0)