forked from simdjson/simdjson
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.cpp
More file actions
688 lines (600 loc) · 26.8 KB
/
main.cpp
File metadata and controls
688 lines (600 loc) · 26.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
#include "linux-perf-events.h"
#include <iostream>
#include <iomanip>
#include <chrono>
#include <fstream>
#include <sstream>
#include <string>
#include <cstring>
#include <vector>
#include <set>
#include <map>
#include <algorithm>
#include <x86intrin.h>
#include <assert.h>
#include "common_defs.h"
using namespace std;
//#define DEBUG
#ifdef DEBUG
// Debug aid: print the 32 bytes of `d` as decimal values in 3-wide columns,
// followed by `msg`. A '|' follows every 8th byte, a ':' follows every other
// 4th byte, and a space follows all remaining bytes.
inline void dump256(m256 d, string msg) {
    const u8 * bytes = (const u8 *)&d;
    for (u32 pos = 1; pos <= 32; pos++) {
        cout << setw(3) << (int)bytes[pos - 1];
        // pick the separator from the 1-based position of the byte just printed
        const char * separator = (pos % 8 == 0) ? "|" : ((pos % 4 == 0) ? ":" : " ");
        cout << separator;
    }
    cout << " " << msg << "\n";
}
// Debug aid: print the 64 bits of `v`, least-significant bit first, using '1'
// for set bits and '_' for clear bits, followed by `msg`.
void dumpbits(u64 v, string msg) {
    u64 remaining = v;
    for (u32 printed = 0; printed < 64; printed++) {
        const bool bit_set = (remaining & 0x1ULL) != 0;
        std::cout << (bit_set ? "1" : "_");
        remaining >>= 1;
    }
    cout << " " << msg << "\n";
}
#else
// When DEBUG is not defined the dump helpers expand to an empty statement, so
// call sites such as `dumpbits(x, "label");` compile to nothing and their
// arguments are never evaluated.
#define dump256(a,b) ;
#define dumpbits(a,b) ;
#endif
// Read the whole of `filename` into a freshly allocated 64-byte-aligned buffer
// so that the SIMD stages can always consume full 64-byte chunks.
//
// The allocation size is ROUNDUP_N(length, 64) (assumed to round `length` up to
// a multiple of 64 -- the macro lives in common_defs.h), with a minimum of 64
// bytes so that an empty file never results in a zero-byte allocation. Bytes
// past the file contents are filled with spaces (0x20).
//
// Returns {buffer, length} where `length` is the unpadded file size. The buffer
// comes from posix_memalign and is owned by the caller.
// Throws a `const char *` ("No corpus" / "Allocation failed") on failure.
pair<u8 *, size_t> get_corpus(string filename) {
    ifstream is(filename, ios::binary);
    if (!is) {
        throw "No corpus";
    }
    stringstream buffer;
    buffer << is.rdbuf();
    // stringstream::str() returns a copy, so take it exactly once
    const string contents = buffer.str();
    const size_t length = contents.size();
    size_t padded_length = ROUNDUP_N(length, 64);
    if (padded_length == 0) {
        padded_length = 64; // empty input: still hand back a valid, padded block
    }
    char * aligned_buffer;
    if (posix_memalign((void **)&aligned_buffer, 64, padded_length)) {
        throw "Allocation failed";
    }
    memset(aligned_buffer, 0x20, padded_length);
    memcpy(aligned_buffer, contents.data(), length);
    is.close();
    return make_pair((u8 *)aligned_buffer, length);
}
// One node of the parsed representation. Within this file the type is only
// used to size the `nodes` array allocated in main(); no visible code reads or
// writes the individual fields.
struct JsonNode {
u32 next;
u32 next_type;
u64 payload; // a freeform 'payload' holding a parsed representation of *something*
};
// Working storage shared by the parsing stages. All buffers are allocated by
// main(); the struct itself does not own or free them.
struct ParsedJson {
// bitmap with one bit per input byte, written 64 bits at a time by find_structural_bits
u8 * structurals;
// number of valid entries in structural_indexes (including the reserved leading ones),
// set by flatten_indexes
u32 n_structural_indexes;
// byte offsets of the set bits in `structurals`, filled in by flatten_indexes
u32 * structural_indexes;
// allocated in main(); not touched by any stage visible in this file
JsonNode * nodes;
};
// Byte-wise equality test of a 64-byte input (two 256-bit halves) against
// `mask`. Bit i of the result is set when input byte i equals the matching
// byte of `mask`; input_lo supplies bits 0-31 and input_hi bits 32-63.
// 5 uops; would be cheaper in AVX512.
really_inline u64 cmp_mask_against_input(m256 input_lo, m256 input_hi, m256 mask) {
    // the (u32) cast stops the int movemask result from sign-extending into the high half
    const u64 lo_bits = (u32)_mm256_movemask_epi8(_mm256_cmpeq_epi8(input_lo, mask));
    // no cast needed here: any sign-extension is shifted out by the << 32 below
    const u64 hi_bits = _mm256_movemask_epi8(_mm256_cmpeq_epi8(input_hi, mask));
    return (hi_bits << 32) | lo_bits;
}
// Stage 1: scan `buf` in 64-byte chunks and, for each chunk, store a 64-bit
// bitmap into pj.structurals (bit i <=> byte idx+i) marking:
//   - the structural characters { } [ ] : , that lie outside quoted strings,
//   - unescaped quote characters,
//   - "pseudo-structural" characters (first byte of an atom outside quotes).
//
// buf: input; every chunk is read in full with aligned 256-bit loads, so the buffer
//      must be readable up to the next multiple of 64 past `len` and suitably aligned
//      (get_corpus provides a 64-byte aligned, space-padded buffer).
// len: number of meaningful input bytes.
// pj:  pj.structurals must have room for one u64 per 64-byte chunk.
// Always returns true; no validation is performed here.
never_inline bool find_structural_bits(const u8 * buf, size_t len, ParsedJson & pj) {
// Useful constant masks
const u64 even_bits = 0x5555555555555555ULL;
const u64 odd_bits = ~even_bits;
// for now, just work in 64-byte chunks
// the input is expected to be padded out to a 64 byte multiple; get_corpus fills the
// remainder with spaces (0x20)
// persistent state across loop
u64 prev_iter_ends_odd_backslash = 0ULL; // either 0 or 1, but a 64-bit value
u64 prev_iter_inside_quote = 0ULL; // either all zeros or all ones
u64 prev_iter_ends_pseudo_pred = 0ULL; // 0 or 1: top bit of the previous chunk's pseudo_pred
for (size_t idx = 0; idx < len; idx+=64) {
#ifdef DEBUG
cout << "Idx is " << idx << "\n";
for (u32 j = 0; j < 64; j++) {
char c = *(buf+idx+j);
if (isprint(c)) {
cout << c;
} else {
cout << '_';
}
}
cout << "| ... input\n";
#endif
m256 input_lo = _mm256_load_si256((const m256 *)(buf + idx + 0));
m256 input_hi = _mm256_load_si256((const m256 *)(buf + idx + 32));
////////////////////////////////////////////////////////////////////////////////////////////
// Step 1: detect odd sequences of backslashes
////////////////////////////////////////////////////////////////////////////////////////////
u64 bs_bits = cmp_mask_against_input(input_lo, input_hi, _mm256_set1_epi8('\\'));
dumpbits(bs_bits, "backslash bits");
// a start edge is a backslash whose predecessor (within this chunk) is not a backslash
u64 start_edges = bs_bits & ~(bs_bits << 1);
dumpbits(start_edges, "start_edges");
// flip lowest if we have an odd-length run at the end of the prior iteration
u64 even_start_mask = even_bits ^ prev_iter_ends_odd_backslash;
u64 even_starts = start_edges & even_start_mask;
u64 odd_starts = start_edges & ~even_start_mask;
dumpbits(even_starts, "even_starts");
dumpbits(odd_starts, "odd_starts");
// adding the start bit to the run makes the carry ripple through the run and land on
// the first position after it
u64 even_carries = bs_bits + even_starts;
u64 odd_carries;
// must record the carry-out of our odd-carries out of bit 63; this indicates whether the
// sense of any edge going to the next iteration should be flipped
bool iter_ends_odd_backslash = __builtin_uaddll_overflow(bs_bits, odd_starts, &odd_carries);
odd_carries |= prev_iter_ends_odd_backslash; // push in bit zero as a potential end
// if we had an odd-numbered run at the end of
// the previous iteration
prev_iter_ends_odd_backslash = iter_ends_odd_backslash ? 0x1ULL : 0x0ULL;
dumpbits(even_carries, "even_carries");
dumpbits(odd_carries, "odd_carries");
// keep only the carry bits that landed past the end of a run
u64 even_carry_ends = even_carries & ~bs_bits;
u64 odd_carry_ends = odd_carries & ~bs_bits;
dumpbits(even_carry_ends, "even_carry_ends");
dumpbits(odd_carry_ends, "odd_carry_ends");
// a run has odd length when its start and end positions have opposite parity
u64 even_start_odd_end = even_carry_ends & odd_bits;
u64 odd_start_even_end = odd_carry_ends & even_bits;
dumpbits(even_start_odd_end, "esoe");
dumpbits(odd_start_even_end, "osee");
u64 odd_ends = even_start_odd_end | odd_start_even_end;
dumpbits(odd_ends, "odd_ends");
////////////////////////////////////////////////////////////////////////////////////////////
// Step 2: detect insides of quote pairs
////////////////////////////////////////////////////////////////////////////////////////////
u64 quote_bits = cmp_mask_against_input(input_lo, input_hi, _mm256_set1_epi8('"'));
// drop quotes that directly follow an odd-length backslash run (i.e. escaped quotes)
quote_bits = quote_bits & ~odd_ends;
dumpbits(quote_bits, "quote_bits");
// carry-less multiply by all-ones: bit i of the low 64 bits of the product is the XOR of
// quote_bits[0..i], i.e. a running parity of the quotes seen so far in this chunk
u64 quote_mask = _mm_cvtsi128_si64(_mm_clmulepi64_si128(_mm_set_epi64x(0ULL, quote_bits),
_mm_set1_epi8(0xFF), 0));
// invert the whole mask if the previous chunk ended inside a quote
quote_mask ^= prev_iter_inside_quote;
// broadcast the top bit (arithmetic shift) so the state is all zeros or all ones
prev_iter_inside_quote = (u64)((s64)quote_mask>>63);
dumpbits(quote_mask, "quote_mask");
// How do we build up a user traversable data structure
// first, do a 'shufti' to detect structural JSON characters
// they are { 0x7b } 0x7d : 0x3a [ 0x5b ] 0x5d , 0x2c
// these go into the first 3 buckets of the comparison (1/2/4)
// we are also interested in the four whitespace characters
// space 0x20, linefeed 0x0a, horizontal tab 0x09 and carriage return 0x0d
// these go into the next 2 buckets of the comparison (8/16)
const m256 low_nibble_mask = _mm256_setr_epi8(
// 0 9 a b c d
16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0,
16, 0, 0, 0, 0, 0, 0, 0, 0, 8, 12, 1, 2, 9, 0, 0
);
// NOTE(review): entries 11..13 (high nibbles 0xb/0xc/0xd) are non-zero here, which lets some
// non-ASCII bytes (e.g. 0xbb, 0xcc, 0xdd) match the structural buckets -- confirm intent
const m256 high_nibble_mask = _mm256_setr_epi8(
// 0 2 3 5 7
8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0,
8, 0, 18, 4, 0, 1, 0, 1, 0, 0, 0, 3, 2, 1, 0, 0
);
m256 structural_shufti_mask = _mm256_set1_epi8(0x7);
m256 whitespace_shufti_mask = _mm256_set1_epi8(0x18);
// a byte belongs to a bucket when both its low-nibble and high-nibble lookups have that
// bucket bit set; the 0x7f mask clears bit 7 of each shifted byte so the shuffle always
// uses its low 4 bits (the original high nibble) as the table index
m256 v_lo = _mm256_and_si256(
_mm256_shuffle_epi8(low_nibble_mask, input_lo),
_mm256_shuffle_epi8(high_nibble_mask,
_mm256_and_si256(_mm256_srli_epi32(input_lo, 4), _mm256_set1_epi8(0x7f))));
m256 v_hi = _mm256_and_si256(
_mm256_shuffle_epi8(low_nibble_mask, input_hi),
_mm256_shuffle_epi8(high_nibble_mask,
_mm256_and_si256(_mm256_srli_epi32(input_hi, 4), _mm256_set1_epi8(0x7f))));
// compare against zero and invert afterwards: a byte is structural when any of the
// structural bucket bits survived
m256 tmp_lo = _mm256_cmpeq_epi8(_mm256_and_si256(v_lo, structural_shufti_mask),
_mm256_set1_epi8(0));
m256 tmp_hi = _mm256_cmpeq_epi8(_mm256_and_si256(v_hi, structural_shufti_mask),
_mm256_set1_epi8(0));
u64 structural_res_0 = (u32)_mm256_movemask_epi8(tmp_lo);
u64 structural_res_1 = _mm256_movemask_epi8(tmp_hi);
u64 structurals = ~(structural_res_0 | (structural_res_1 << 32));
// this additional mask and transfer is non-trivially expensive, unfortunately
m256 tmp_ws_lo = _mm256_cmpeq_epi8(_mm256_and_si256(v_lo, whitespace_shufti_mask),
_mm256_set1_epi8(0));
m256 tmp_ws_hi = _mm256_cmpeq_epi8(_mm256_and_si256(v_hi, whitespace_shufti_mask),
_mm256_set1_epi8(0));
u64 ws_res_0 = (u32)_mm256_movemask_epi8(tmp_ws_lo);
u64 ws_res_1 = _mm256_movemask_epi8(tmp_ws_hi);
u64 whitespace = ~(ws_res_0 | (ws_res_1 << 32));
dumpbits(structurals, "structurals");
dumpbits(whitespace, "whitespace");
// mask off anything inside quotes
structurals &= ~quote_mask;
// add the real quote bits back into our bitmask as well, so we can
// quickly traverse the strings we've spent all this trouble gathering
structurals |= quote_bits;
// Now, establish "pseudo-structural characters". These are non-whitespace characters
// that are (a) outside quotes and (b) have a predecessor that's either whitespace or a structural
// character. This means that subsequent passes will get a chance to encounter the first character
// of every string of non-whitespace and, if we're parsing an atom like true/false/null or a number
// we can stop at the first whitespace or structural character following it.
// a qualified predecessor is something that can happen 1 position before a
// pseudo-structural character
u64 pseudo_pred = structurals | whitespace;
dumpbits(pseudo_pred, "pseudo_pred");
// shift by one so each bit describes the predecessor; bit 0 comes from the previous chunk
u64 shifted_pseudo_pred = (pseudo_pred << 1) | prev_iter_ends_pseudo_pred;
dumpbits(shifted_pseudo_pred, "shifted_pseudo_pred");
prev_iter_ends_pseudo_pred = pseudo_pred >> 63;
u64 pseudo_structurals = shifted_pseudo_pred & (~whitespace) & (~quote_mask);
dumpbits(pseudo_structurals, "pseudo_structurals");
dumpbits(structurals, "final structurals without pseudos");
structurals |= pseudo_structurals;
dumpbits(structurals, "final structurals and pseudo structurals");
// idx is a multiple of 64, so idx/8 is the byte offset of this chunk's 64-bit word
*(u64 *)(pj.structurals + idx/8) = structurals;
}
return true;
}
// The first NUM_RESERVED_NODES slots of pj.structural_indexes are reserved:
// flatten_indexes zeroes slots DUMMY_NODE and ROOT_NODE and starts writing real
// offsets after them, and ape_machine starts reading at NUM_RESERVED_NODES.
const u32 NUM_RESERVED_NODES = 2;
const u32 DUMMY_NODE = 0;
const u32 ROOT_NODE = 1;
// just transform the bitmask to a big list of 32-bit integers for now
// that's all; the type of character the offset points to will
// tell us exactly what we need to know. Naive but straightforward implementation
//
// Stage 2: reads the per-chunk bitmaps in pj.structurals (one u64 per 64 input bytes,
// covering `len` bytes) and appends the byte offset of every set bit to
// pj.structural_indexes, after the NUM_RESERVED_NODES reserved slots. Sets
// pj.n_structural_indexes to the total number of entries (reserved slots included).
// Always returns true.
//
// Unless SUPPRESS_CHEESY_FLATTEN is defined, offsets are emitted seven at a time without
// checking how many bits remain, so up to 6 slots past the last real entry may be written
// with junk; the caller must size pj.structural_indexes accordingly (main() adds slack).
never_inline bool flatten_indexes(size_t len, ParsedJson & pj) {
u32 base = NUM_RESERVED_NODES;
u32 * base_ptr = pj.structural_indexes;
base_ptr[DUMMY_NODE] = base_ptr[ROOT_NODE] = 0; // really shouldn't matter
for (size_t idx = 0; idx < len; idx+=64) {
u64 s = *(u64 *)(pj.structurals + idx/8);
#ifdef SUPPRESS_CHEESY_FLATTEN
// simple version: peel off the lowest set bit until none remain
while (s) {
base_ptr[base++] = (u32)idx + __builtin_ctzll(s); s &= s - 1ULL;
}
#else
// the true number of entries for this chunk; base is reset to this after the
// unrolled loop, which discards any junk slots it wrote
u32 cnt = __builtin_popcountll(s);
u32 next_base = base + cnt;
while (s) {
// spoil the suspense
// pdep deposits the low bits of the first operand at the set-bit positions of s, so
// these yield s with its lowest 3 / 5 set bits cleared without a serial dependency chain
u64 s3 = _pdep_u64(~0x7ULL, s); // s3 will have bottom 3 1-bits unset
u64 s5 = _pdep_u64(~0x1fULL, s); // s5 will have bottom 5 1-bits unset
// NOTE(review): when fewer than 7 bits remain, some of s1..s6 are zero and
// __builtin_ctzll(0) has an undefined result; those slots lie at or beyond next_base
// and are overwritten or ignored later -- confirm this is acceptable on all targets
base_ptr[base+0] = (u32)idx + __builtin_ctzll(s); u64 s1 = s & (s - 1ULL);
base_ptr[base+1] = (u32)idx + __builtin_ctzll(s1); u64 s2 = s1 & (s1 - 1ULL);
base_ptr[base+2] = (u32)idx + __builtin_ctzll(s2); //u64 s3 = s2 & (s2 - 1ULL);
base_ptr[base+3] = (u32)idx + __builtin_ctzll(s3); u64 s4 = s3 & (s3 - 1ULL);
base_ptr[base+4] = (u32)idx + __builtin_ctzll(s4); //u64 s5 = s4 & (s4 - 1ULL);
base_ptr[base+5] = (u32)idx + __builtin_ctzll(s5); u64 s6 = s5 & (s5 - 1ULL);
base_ptr[base+6] = (u32)idx + __builtin_ctzll(s6); u64 s7 = s6 & (s6 - 1ULL);
s = s7;
base += 7;
}
base = next_base;
#endif
}
pj.n_structural_indexes = base;
return true;
}
// Number of nesting levels tracked by ape_machine: it sizes the per-depth state
// array, the per-depth tape cursors, and (multiplied by MAX_TAPE_ENTRIES) the tape.
const u32 MAX_DEPTH = 256;
// the ape machine consists of two parts:
//
// 1) The "state machine", which is a multiple channel per-level state machine
// It is a conventional DFA except in that it 'changes track' on {}[] characters
//
// 2) The "tape machine": this records offsets of various structures as they go by
// These structures are either u32 offsets of other tapes or u32 offsets into our input
// or structures.
//
// The state machine doesn't record output.
// The tape machine doesn't validate.
//
// The output of the tape machine is meaningful only if the state machine is in non-error states.
// depth adjustment is strictly based on whether we are {[ or }]
// depth adjustment is a pre-increment which, in effect, means that a {[ contained in an object
// is in the level one deeper, while the corresponding }] is at the level
// TAPE MACHINE DEFINITIONS
// A control word packs three independent fields:
//   bits 0-1 : depth adjustment code (decoded by get_depth_adjust as code - 1)
//   bits 2-3 : number of tape bytes to advance by (0, 4 or 8; see get_write_size)
//   bit 31   : what to write to the tape (input index, or the "uptape" location
//              of the level above; see is_uptape)
const u32 DEPTH_PLUS_ONE = 0x2;
const u32 DEPTH_ZERO = 0x1;
const u32 DEPTH_MINUS_ONE = 0x0;
const u32 TAKE_UPTAPE = 0x80000000;
const u32 TAKE_INDEX = 0x0;
const u32 WRITE_ZERO = 0x0;
const u32 WRITE_FOUR = 0x4;
const u32 WRITE_EIGHT = 0x8;
// Naming: C<depth><source><size> -- depth 0/P(lus)/M(inus), source I(ndex)/U(ptape),
// size 0/4/8. CDEF is the default for uninteresting characters.
const u32 CDEF = DEPTH_ZERO | TAKE_INDEX | WRITE_ZERO;
const u32 C0I4 = DEPTH_ZERO | TAKE_INDEX | WRITE_FOUR;
// was WRITE_FOUR, which made C0I8 indistinguishable from C0I4 despite its name
const u32 C0I8 = DEPTH_ZERO | TAKE_INDEX | WRITE_EIGHT;
const u32 CPI0 = DEPTH_PLUS_ONE | TAKE_INDEX | WRITE_ZERO;
const u32 CMU8 = DEPTH_MINUS_ONE | TAKE_UPTAPE | WRITE_EIGHT;
// Decode the depth field (low two bits) of a control word: 0/1/2 -> -1/0/+1.
inline s8 get_depth_adjust(u32 control) {
    const u32 depth_code = control & 0x3;
    return (s8)depth_code - 1;
}
// True when the control word asks for the uptape location rather than the input index.
inline bool is_uptape(u32 control) {
    return (control & TAKE_UPTAPE) != 0;
}
// Extract the write-size field (bits 2-3) of a control word; the value is in bytes.
inline size_t get_write_size(u32 control) {
    return control & 0xC;
}
// Tape-machine control word for every possible input byte, indexed by the byte value.
// Each row below covers 16 consecutive byte values.
const u32 char_control[256] = {
// nothing interesting from 0x00-0x1f
CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF,
CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF,
// " is 0x22, - is 0x2d
CDEF,CDEF,C0I4,CDEF, CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF, CDEF,C0I8,CDEF,CDEF,
// numbers are 0x30-0x39
C0I8,C0I8,C0I8,C0I8, C0I8,C0I8,C0I8,C0I8, C0I8,C0I8,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF,
// nothing interesting from 0x40-0x4f
CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF,
// 0x5b/5d are []
CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CPI0, CDEF,CMU8,CDEF,CDEF,
// nothing interesting from 0x60-0x6f
CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF,
// 0x7b/7d are {}
CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CPI0, CDEF,CMU8,CDEF,CDEF,
// nothing interesting from 0x80-0xff
CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF,
CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF,
CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF,
CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF,
CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF,
CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF,
CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF,
CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF, CDEF,CDEF,CDEF,CDEF
};
// The tape is one global array split into MAX_DEPTH equal regions of MAX_TAPE_ENTRIES
// entries each; ape_machine starts the cursor for depth d at d*MAX_TAPE_ENTRIES.
// That is 256 * 1M = 268,435,456 u32 entries of static storage (1 GiB if u32 is
// 4 bytes -- the typedef lives in common_defs.h).
const size_t MAX_TAPE_ENTRIES = 1024*1024;
const size_t MAX_TAPE = MAX_DEPTH * MAX_TAPE_ENTRIES;
u32 tape[MAX_TAPE];
// STATE MACHINE DECLARATIONS
const u32 MAX_STATES = 16;
// trans[state][byte] is the next state. As a global it starts zero-initialized and
// init_state_machine only fills in selected entries, so every other transition leads to
// state 0 (presumably the error/sink state -- nothing visible here checks for it).
u32 trans[MAX_STATES][256];
// disallow_exit[state][byte] is non-zero when seeing `byte` right after the machine was
// left in `state` should be flagged as an error (only set for '}' and ']').
u32 disallow_exit[MAX_STATES][256];
// current state of each nesting level; reset to START_STATE at the start of ape_machine
u32 states[MAX_DEPTH];
const int START_STATE = 1;
// Populate the global `trans` and `disallow_exit` tables. Must run once before
// ape_machine. Entries not assigned here keep their initial (zero) value.
never_inline void init_state_machine() {
    struct Edge {
        u32 from;
        unsigned char on;
        u32 to;
    };
    const Edge edges[] = {
        // object track: { "key" : "value" , "key" ...
        { 1, '{',  2}, { 2, '"',  3}, { 3, '"',  4}, { 4, ':',  5},
        { 5, '"',  6}, { 6, '"',  7}, { 7, ',',  8}, { 8, '"',  3},
        // array track: [ "value" , "value" ...
        { 1, '[',  9}, { 9, '"', 10}, {10, '"', 11}, {11, ',', 12}, {12, '"', 10},
        // back transitions when new things are open
        { 2, '{',  2}, { 7, '{',  2}, { 9, '{',  2}, {11, '{',  2},
        { 2, '[',  9}, { 7, '[',  9}, { 9, '[',  9}, {11, '[',  9},
    };
    for (const Edge & edge : edges) {
        trans[edge.from][edge.on] = edge.to;
    }

    // single-character ("unary") values ftn0123456789- plus the closers }]:
    // 5->7 in an object, 9->11 and 12->11 in an array
    for (const char * unary = "}]ftn0123456789-"; *unary != '\0'; ++unary) {
        const u32 ch = (u32)*unary;
        trans[ 5][ch] = 7;
        trans[ 9][ch] = 11;
        trans[12][ch] = 11;
    }

    // note - extra-linguistic stuff in the DFA
    // only states 2/7 are OK to see a } at the shallower depth
    // only states 9/11 are OK to see a ] at the shallower depth
    // nothing else should be illegal through this mechanism
    for (u32 state = 0; state < MAX_STATES; ++state) {
        const bool object_may_close = (state == 2) || (state == 7);
        const bool array_may_close = (state == 9) || (state == 11);
        if (!object_may_close) {
            disallow_exit[state]['}'] = 1;
        }
        if (!array_may_close) {
            disallow_exit[state][']'] = 1;
        }
    }
}
// Stage 3: walk the structural indexes produced by flatten_indexes and drive two
// coupled machines for each structural character:
//   - the tape machine, which records an offset on the tape region of the current depth
//     as directed by char_control, and
//   - the state machine, which advances the per-depth DFA state via `trans` and
//     accumulates `disallow_exit` violations.
//
// buf: the input bytes the structural indexes point into.
// len: unused.
// pj:  supplies structural_indexes / n_structural_indexes.
// Returns false if any disallowed '}' / ']' was seen, true otherwise. Writes the global
// `tape` and `states` arrays.
//
// No bounds checks are made on `depth`: input nested deeper than the arrays allow, or with
// too many closing brackets, indexes outside tape_locs/states (see the note below).
never_inline bool ape_machine(const u8 * buf, UNUSED size_t len, ParsedJson & pj) {
    // NOTE - our depth is used by both the tape machine and the state machine
    // Further, in production we will set it to a largish value in a generous buffer as a rogue input
    // could consist of many {[ characters or many }] characters. We aren't busily checking errors
    // (and in fact, a aggressive sequence of [ characters is actually valid input!) so something that
    // blows out maximum depth will need to be periodically checked for, as will something that tries
    // to set depth very low. If we set our starting depth, say, to 256, we can tolerate 256 bogus close brace
    // characters without aggressively going wrong and writing to bad memory
    // Note that any specious depth can have a specious tape associated with and all these specious depths
    // can share a region of the tape - it's harmless. Since tape is one-way, any movement in a specious tape
    // is an error (so we can detect max_depth violations by making sure that specious tape locations haven't
    // moved from their starting values)
    u32 depth = 1;
    u32 tape_locs[MAX_DEPTH];
    for (u32 d = 0; d < MAX_DEPTH; d++) {
        tape_locs[d] = d*MAX_TAPE_ENTRIES;
        states[d] = START_STATE;
    }

    u32 error_sump = 0;
    u32 old_state = 0; // experimental
    u32 i = NUM_RESERVED_NODES;

    // One step of both machines for the structural character `c` found at input offset `idx`.
    // Shared by the main loop and the final (peeled) element so the two cannot drift apart.
    auto process_structural = [&](u32 idx, u8 c) {
#ifdef DEBUG
        cout << "i: " << i << " idx: " << idx << " c " << c << "\n";
#endif
        // TAPE MACHINE
        u32 control = char_control[c];
        s8 depth_adjust = get_depth_adjust(control);
        bool take_uptape = is_uptape(control);
        u8 write_size = get_write_size(control)/4; // bytes -> u32 tape slots
        // the depth change is applied before the write, so an opener is recorded at the deeper level
        depth += depth_adjust;
#ifdef DEBUG
        cout << "TAPE MACHINE: depth change " << (s32)depth_adjust << " take_uptape: " << (u32)take_uptape
             << " write_size " << (u32)write_size << " current_depth: " << depth << "\n";
#endif
        u32 uptape = tape_locs[depth+1];
        tape[tape_locs[depth]] = take_uptape ? uptape : idx;
        tape_locs[depth] += write_size;
        // STATE MACHINE
#ifdef DEBUG
        cout << "STATE MACHINE: error_sump: " << error_sump << " old state " << old_state << " disallowed_exit[old_state][c]: " << disallow_exit[old_state][c] << "\n";
        cout << "STATE MACHINE: state[depth] pre " << states[depth] << " ";
#endif
        error_sump |= disallow_exit[old_state][c];
        old_state = states[depth] = trans[states[depth]][c];
#ifdef DEBUG
        cout << "post " << states[depth] << "\n";
#endif
    };

    // The index/character for the next element is fetched one iteration ahead of its use.
    u32 nextidx = 0;
    u8 nextc = 0;
    if (i < pj.n_structural_indexes) {
        nextidx = pj.structural_indexes[i];
        nextc = buf[nextidx];
    }
    for (; i + 1 < pj.n_structural_indexes; i++) {
        const u32 idx = nextidx;
        const u8 c = nextc;
        nextidx = pj.structural_indexes[i + 1];
        nextc = buf[nextidx];
        process_structural(idx, c);
    }
    // last element: nothing further to prefetch
    if (i < pj.n_structural_indexes) {
        process_structural(nextidx, nextc);
    }

#ifdef DEBUG
    for (u32 d = 0; d < MAX_DEPTH; d++) {
        u32 start_loc = d*MAX_TAPE_ENTRIES;
        cout << " tape section i " << d << " from: " << start_loc
             << " to: " << tape_locs[d] << " "
             << " size: " << (tape_locs[d]-start_loc) << "\n";
        cout << " state: " << states[d] << "\n";
        /*
        for (u32 j = start_loc; j < tape_locs[d]; j++) {
            cout << "j: " << j << " tape[j]: " << tape[j] << "\n";
        }
        */
    }
#endif
    return error_sump == 0;
}
// Minimal ANSI colour support for terminal output, after
// https://stackoverflow.com/questions/2616906/how-do-i-output-coloured-text-to-a-linux-terminal
namespace Color {
    // Numeric codes emitted between "\033[" and "m".
    enum Code {
        // standard foreground colours
        FG_BLACK = 30,
        FG_RED = 31,
        FG_GREEN = 32,
        FG_YELLOW = 33,
        FG_BLUE = 34,
        FG_MAGENTA = 35,
        FG_CYAN = 36,
        FG_LIGHT_GRAY = 37,
        FG_DEFAULT = 39,
        // bright foreground colours
        FG_DARK_GRAY = 90,
        FG_LIGHT_RED = 91,
        FG_LIGHT_GREEN = 92,
        FG_LIGHT_YELLOW = 93,
        FG_LIGHT_BLUE = 94,
        FG_LIGHT_MAGENTA = 95,
        FG_LIGHT_CYAN = 96,
        FG_WHITE = 97,
        // background colours
        BG_RED = 41,
        BG_GREEN = 42,
        BG_BLUE = 44,
        BG_DEFAULT = 49
    };

    // Stream manipulator: inserting a Modifier into an ostream writes the escape
    // sequence that selects its colour code.
    class Modifier {
    public:
        Modifier(Code pCode) : code(pCode) {}

        friend std::ostream& operator<<(std::ostream& os, const Modifier& mod) {
            os << "\033[";
            os << static_cast<int>(mod.code);
            os << "m";
            return os;
        }

    private:
        Code code;
    };
}
// Print the input to stdout with every structural character coloured:
// brackets/braces ({ [ } ]) in green, all other structural characters in yellow.
// The text between consecutive structural characters is printed uncoloured.
void colorfuldisplay(ParsedJson & pj, const u8 * buf) {
    const Color::Modifier bracket_color(Color::FG_GREEN);
    const Color::Modifier other_color(Color::FG_YELLOW);
    const Color::Modifier reset_color(Color::FG_DEFAULT);
    const size_t count = pj.n_structural_indexes;

    // skip initial fluff: leading entries that repeat the same offset
    size_t first = 0;
    while ((first + 1 < count) && (pj.structural_indexes[first] == pj.structural_indexes[first + 1])) {
        ++first;
    }

    for (size_t k = first; k < count; ++k) {
        const u32 here = pj.structural_indexes[k];
        // clearing bit 5 folds { onto [ (0x7b -> 0x5b) and } onto ] (0x7d -> 0x5d)
        const u8 folded = buf[here] & 0xdf;
        const bool is_bracket = (folded == 0x5b) || (folded == 0x5d);
        std::cout << (is_bracket ? bracket_color : other_color) << buf[here] << reset_color;

        if (k + 1 >= count) {
            continue; // nothing follows the last structural character
        }
        const u32 stop = pj.structural_indexes[k + 1];
        for (u32 pos = here + 1; pos < stop; ++pos) {
            std::cout << buf[pos];
        }
    }
    std::cout << std::endl;
}
// Benchmark driver. Usage: <program> <jsonfile>
//
// Loads the file, then runs the three stages (find_structural_bits, flatten_indexes,
// ape_machine) `iterations` times, timing each full pass. On Linux it also accumulates
// per-stage cycle and instruction counts and prints a breakdown. Finally prints the
// fastest pass and the resulting throughput.
// Exits with status 1 on a usage error; allocation or file errors propagate as thrown
// `const char *` values.
int main(int argc, char * argv[]) {
    if (argc != 2) {
        cerr << "Usage: " << argv[0] << " <jsonfile>\n";
        exit(1);
    }
    pair<u8 *, size_t> p = get_corpus(argv[1]);
    ParsedJson pj;
    // one bit per (padded) input byte
    if (posix_memalign( (void **)&pj.structurals, 8, ROUNDUP_N(p.second, 64)/8)) {
        throw "Allocation failed";
    };
    init_state_machine();
    pj.n_structural_indexes = 0;
    // we have potentially 1 structure per byte of input
    // as well as a dummy structure and a root structure
    // we also potentially write up to 7 iterations beyond
    // in our 'cheesy flatten', so make some worst-case
    // space for that too
    u32 max_structures = ROUNDUP_N(p.second, 64) + 2 + 7;
    pj.structural_indexes = new u32[max_structures];
    pj.nodes = new JsonNode[max_structures];
#if defined(DEBUG)
    const u32 iterations = 1;
#else
    const u32 iterations = 1000;
#endif
    vector<double> res;
    res.resize(iterations);
#ifdef __linux__
    LinuxEvents<PERF_TYPE_HARDWARE> cycles(PERF_COUNT_HW_CPU_CYCLES);
    LinuxEvents<PERF_TYPE_HARDWARE> instructions(PERF_COUNT_HW_INSTRUCTIONS);
    // cyN accumulates cycles and clN accumulates instructions for stage N
    unsigned long cy1 = 0, cy2 = 0, cy3 = 0;
    unsigned long cl1 = 0, cl2 = 0, cl3 = 0;
#endif
    for (u32 i = 0; i < iterations; i++) {
        auto start = std::chrono::steady_clock::now();
#ifdef __linux__
        cycles.start(); instructions.start();
#endif
        find_structural_bits(p.first, p.second, pj);
#ifdef __linux__
        cy1 += cycles.end(); cl1 += instructions.end();
        cycles.start(); instructions.start();
#endif
        flatten_indexes(p.second, pj);
#ifdef __linux__
        cy2 += cycles.end(); cl2 += instructions.end();
        cycles.start(); instructions.start();
#endif
        ape_machine(p.first, p.second, pj);
#ifdef __linux__
        cy3 += cycles.end(); cl3 += instructions.end();
#endif
        auto end = std::chrono::steady_clock::now();
        std::chrono::duration<double> secs = end - start;
        res[i] = secs.count();
    }
#ifdef __linux__
    unsigned long total = cy1 + cy2 + cy3 ;
    // argument order matches the labels: instruction count first, then cycle count
    // (these two were previously passed the other way round); the percentage is the
    // stage's share of total cycles
    printf("stage 1 instructions: %10lu cycles: %10lu (%.1f %%) ins/cycles: %.2f \n",
           cl1, cy1, 100. * cy1 / total, (double) cl1 / cy1);
    printf(" stage 1 runs at %.1f cycles per input byte.\n", (double) cy1 / (iterations * p.second));
    printf("stage 2 instructions: %10lu cycles: %10lu (%.1f %%) ins/cycles: %.2f \n",
           cl2, cy2, 100. * cy2 / total, (double) cl2 / cy2);
    printf(" stage 2 runs at %.1f cycles per input byte.\n", (double) cy2 / (iterations * p.second));
    printf("stage 3 instructions: %10lu cycles: %10lu (%.1f %%) ins/cycles: %.2f \n",
           cl3, cy3, 100. * cy3 / total, (double) cl3 / cy3);
    printf(" stage 3 runs at %.1f cycles per input byte.\n", (double) cy3 / (iterations * p.second));
    printf(" all stages: %.1f cycles per input byte.\n", (double) total / (iterations * p.second));
#endif
    // colorfuldisplay(pj, p.first);
    double min_result = *min_element(res.begin(), res.end());
    cout << "Min: " << min_result << " bytes read: " << p.second << " Gigabytes/second: " << (p.second) / (min_result * 1000000000.0) << "\n";
    return 0;
}