#include <display_syscalls.h>
#include <keyboard_syscalls.h>
#include <keyboard.hpp>
#include <color.h>

// Getkey routine
//
// Key state is sampled by copying 8 consecutive 16-bit words from a fixed
// address. Two snapshots are kept: the most recent one (lastkey) and the one
// taken on the previous call to keyupdate() (holdkey).

// Size of the text buffer used to print the counter: 2 prefix bytes, an
// optional sign, up to 10 decimal digits and the terminating NUL
// (assumes int is at most 32 bits wide).
enum { COUNTER_TEXT_SIZE = 2 + 1 + 10 + 1 };

const unsigned short* keyboard_register = (unsigned short*)0xA44B0000;
unsigned short lastkey[8];
unsigned short holdkey[8];

// Shifts the current snapshot into holdkey, then refreshes lastkey from
// keyboard_register.
void keyupdate(void)
{
    memcpy(holdkey, lastkey, sizeof(unsigned short)*8);
    memcpy(lastkey, keyboard_register, sizeof(unsigned short)*8);
}

// Returns 1 if the bit for basic_keycode is set in the latest snapshot
// (lastkey), 0 otherwise.
//
// The keycode is decoded as decimal digits: the last digit is the row, the
// leading digits minus one are the column. Two rows share one 16-bit word;
// odd rows use the upper 8 bits of that word.
int keydownlast(int basic_keycode)
{
    int row, col, word, bit;

    row = basic_keycode%10;
    col = basic_keycode/10-1;
    word = row>>1;
    bit = col + 8*(row&1);
    return (0 != (lastkey[word] & 1<<bit));
}

// Same decoding as keydownlast(), but tests the previous snapshot (holdkey).
int keydownhold(int basic_keycode)
{
    int row, col, word, bit;

    row = basic_keycode%10;
    col = basic_keycode/10-1;
    word = row>>1;
    bit = col + 8*(row&1);
    return (0 != (holdkey[word] & 1<<bit));
}

// Counting benchmark: increments i forever, sampling the keyboard on every
// iteration. Once i has passed 4600000, pressing the key tested with
// KEY_PRGM_ACON prints the current count and the key tested with
// KEY_PRGM_MENU hands control to GetKey().
int main()
{
    int i=0;
    int key;

    // clear screen
    Bdisp_AllClr_VRAM();

    while (1) {
        keyupdate();

        // increment i
        i++;

        if (i>4600000) {
            if (keydownlast(KEY_PRGM_ACON)) {
                // The original 10-byte buffer left only 8 bytes after the
                // 2-byte prefix, i.e. 7 digits plus the terminator. The
                // counter is already 7 digits long when printing starts and
                // keeps growing, so 8-digit values wrote past the end of the
                // buffer. COUNTER_TEXT_SIZE holds any int value.
                char buffer[COUNTER_TEXT_SIZE];

                strcpy(buffer," ");
                // NOTE(review): the number is written after a 2-byte prefix,
                // presumably because PrintXY does not display the first two
                // bytes of its string - confirm against the SDK docs.
                itoa(i, buffer+2);
                PrintXY(1,1,buffer,0,COLOR_BLACK);
                Bdisp_PutDisp_DD();
            }

            // handle [menu]
            if (keydownlast(KEY_PRGM_MENU)) {
                GetKey(&key);
            }
        }
    }

    return 1;
}

[Help Needed] Calculator Benchmark
#41
Posted 13 September 2013 - 10:16 PM
- pier4r likes this
#42
Posted 13 September 2013 - 11:38 PM
CODE A=0.W GOSBVL POP# GOSBVL SAVPTR SKUB { *start !ARM STMDB sp! {R4 R5 R6 LP} LDR R2,[R1,#2316] MOV R3,3 *outer MOV R4,2 *inner MOV R5,R4 *modloop1 MOV R5,R5 LSL #1 CMP R5,R3 BLO modloop1 BEQ outer_end MOV R6,R3 *modloop2 CMP R6,R5 BEQ outer_end SUBHS R6,R6,R5 MOV R5,R5 LSR #1 CMP R5,R4 BHS modloop2 ADD R4,R4,1 CMP R4,R3 BLO inner *outer_end ADD R3,R3,1 CMP R3,R2 BLS outer LDMIA sp! {R4 R5 R6 PC} !ASM *end } C=RSTK D0=C D1=80100 LC(5)end-start MOVEDN LC 80100 ARMSAT GOVLNG GETPTRLOOP ENDCODE
- pier4r likes this
#43
Posted 14 September 2013 - 04:26 AM

#44
Posted 14 September 2013 - 07:30 AM
while (1) {
    // increment i
    i++;

    if (i>4600000) {
        // The keyboard is sampled only after the counter has passed the
        // threshold, so the iterations before that do nothing but count.
        keyupdate();

        if (keydownlast(KEY_PRGM_ACON)) {
            // 2 prefix bytes + optional sign + up to 10 digits + NUL
            // (assumes int is at most 32 bits wide). The original 10-byte
            // buffer only had room for 7 digits after the prefix, and the
            // counter exceeds 7 digits once it reaches 10000000.
            char buffer[2 + 1 + 10 + 1];

            strcpy(buffer," ");
            itoa(i, buffer+2);
            PrintXY(1,1,buffer,0,COLOR_BLACK);
            Bdisp_PutDisp_DD();
        }

        // handle [menu]
        if (keydownlast(KEY_PRGM_MENU)) {
            GetKey(&key);
        }
    }
}
edit: all the results added.
Edited by pier4r, 14 September 2013 - 07:52 AM.
#45
Posted 14 September 2013 - 01:44 PM
#46
Posted 14 September 2013 - 01:47 PM
I think about a sort of "don't check any i/o for a while".
#47
Posted 14 September 2013 - 02:14 PM
OK, now, not overclocked, I get 8823327 with this code:
#include <display_syscalls.h>
#include <keyboard_syscalls.h>
#include <keyboard.hpp>
#include <color.h>

// Getkey routine
//
// Key state is sampled by copying 8 consecutive 16-bit words from a fixed
// address. Two snapshots are kept: the most recent one (lastkey) and the one
// taken on the previous call to keyupdate() (holdkey).

// Size of the text buffer used to print the counter: 2 prefix bytes, an
// optional sign, up to 10 decimal digits and the terminating NUL
// (assumes int is at most 32 bits wide).
enum { COUNTER_TEXT_SIZE = 2 + 1 + 10 + 1 };

const unsigned short* keyboard_register = (unsigned short*)0xA44B0000;
unsigned short lastkey[8];
unsigned short holdkey[8];

// Shifts the current snapshot into holdkey, then refreshes lastkey from
// keyboard_register.
void keyupdate(void)
{
    memcpy(holdkey, lastkey, sizeof(unsigned short)*8);
    memcpy(lastkey, keyboard_register, sizeof(unsigned short)*8);
}

// Returns 1 if the bit for basic_keycode is set in the latest snapshot
// (lastkey), 0 otherwise.
//
// The keycode is decoded as decimal digits: the last digit is the row, the
// leading digits minus one are the column. Two rows share one 16-bit word;
// odd rows use the upper 8 bits of that word.
int keydownlast(int basic_keycode)
{
    int row, col, word, bit;

    row = basic_keycode%10;
    col = basic_keycode/10-1;
    word = row>>1;
    bit = col + 8*(row&1);
    return (0 != (lastkey[word] & 1<<bit));
}

// Same decoding as keydownlast(), but tests the previous snapshot (holdkey).
int keydownhold(int basic_keycode)
{
    int row, col, word, bit;

    row = basic_keycode%10;
    col = basic_keycode/10-1;
    word = row>>1;
    bit = col + 8*(row&1);
    return (0 != (holdkey[word] & 1<<bit));
}

// Counting benchmark: increments i forever. The keyboard is not sampled at
// all until i has passed 4640000; from then on every iteration samples it,
// prints the current count while the key tested with KEY_PRGM_ACON is down,
// and hands control to GetKey() when the key tested with KEY_PRGM_MENU is
// down.
int main()
{
    int i=0;
    int key;

    // clear screen
    Bdisp_AllClr_VRAM();

    while (1) {
        // increment i
        i++;

        if (i>4640000) {
            keyupdate();

            if (keydownlast(KEY_PRGM_ACON)) {
                // The original 10-byte buffer left only 8 bytes after the
                // 2-byte prefix, i.e. 7 digits plus the terminator. The
                // counter is already 7 digits long when printing starts and
                // keeps growing, so 8-digit values wrote past the end of the
                // buffer. COUNTER_TEXT_SIZE holds any int value.
                char buffer[COUNTER_TEXT_SIZE];

                strcpy(buffer," ");
                // NOTE(review): the number is written after a 2-byte prefix,
                // presumably because PrintXY does not display the first two
                // bytes of its string - confirm against the SDK docs.
                itoa(i, buffer+2);
                PrintXY(1,1,buffer,0,COLOR_BLACK);
                Bdisp_PutDisp_DD();
            }

            // handle [menu]
            if (keydownlast(KEY_PRGM_MENU)) {
                GetKey(&key);
            }
        }
    }

    return 1;
}
I am going to try to change the if statement to something higher, tell you how that turns out when I finish.

#48
Posted 14 September 2013 - 03:30 PM
because with a real wait (not busy) for a key as a sort of interrupt, your code can do far better!
#49
Posted 14 September 2013 - 05:48 PM
So the Prizm has only a busy-wait? (like "while(1) { getKey }")
because with a real wait (not busy) for a key as a sort of interrupt, your code can do far better!
Hmm... not sure what you mean, but I am working on increasing the wait until i get the fastest output.

#50
Posted 14 September 2013 - 06:27 PM
- pier4r likes this
#51
Posted 14 September 2013 - 08:39 PM
Hmm... not sure what you mean, but I am working on increasing the wait until i get the fastest output.
http://en.wikipedia....ki/Busy_waiting
Good. Given the results on ultranaiveprimes, you should get around 100M-150M, I guess.
#52
Posted 15 September 2013 - 12:03 PM
:: %1 2500 ZERO_DO DUP %* %SQRT %LN %EXP %ATAN %TAN %1+ LOOP ;
The loop takes 2500 and 0 as its parameters (ZERO_DO provides the 0 for me) because the end is always exclusive, not inclusive like with FOR loops in Basic-like languages. So if I want to run it 2500 times, I must write 2500 0 DO (or collapse the 0 DO into ZERO_DO, which is a single command) instead of 2499 0 DO.
EDIT:
Oops, forgot to set angle mode to radians. Repeating test with correct mode now... I also noticed that most of the time the loop is run only 2499 times. Oh, and I found a solution to the missing %%ATAN in the references for the HP48 results. EDIT:
|
Edited by 3298, 15 September 2013 - 01:36 PM.
- pier4r likes this
#53
Posted 15 September 2013 - 03:53 PM
I found a better UserRPL implementation that is more than twice as fast: 10.6349 seconds for k=1. The code:
<< ALOG DUP SQ DUP 10. / OVER 1. - FOR a DUP a DO DUP SQ 5. PICK / IP 4. PICK MOD UNTIL SWAP OVER == ROT 1. - UNROT PICK3 NOT OR END DROP2 NEXT DROP2 >>I'm working on a SysRPL version, will edit when I'm ready.
EDIT:
Due to the size of the numbers this test produces, I can't use normal bints if I want to run it for k=2. As a consequence, my first try was to translate my UserRPL version to SysRPL using HXS numbers. That one got k=1 solved in about 4 seconds with hardcoded constants n and n² for k=1. When I replaced them with a piece of code that was supposed to calculate them from user input, it was for some reason slowed down to about 12 seconds (yes, slower than my previous UserRPL version). I could have done a version that is slightly faster than the UserRPL version by using real numbers as well, but as the highest number calculated for k=1 is still below the limit of 1048576, I can safely use bints for that one. The result is a time of 1.1421 seconds with the following code:
:: BINT10 BINT100 BINT100 BINT10 DO DUPINDEX@ BEGIN DUPDUP #* 5PICK #/ SWAPDROP 4PICK #/ DROP SWAPOVER #= ROT #1- DUP4UNROLL #0= OR UNTIL 2DROP LOOP 2DROP ;
Edited by 3298, 15 September 2013 - 05:16 PM.
- pier4r likes this
#54
Posted 15 September 2013 - 08:39 PM
edit: all added.
The middle square test takes a very long time (it uses all the basic math operators, but it is less complex than the savage one; maybe it is better suited to faster implementations (1)). I designed it for faster implementations, but it is hard even for smartphones with just k=2. Maybe I'll do an extension of the savage test (which is linear), such as: test the accuracy of the savage operation for numbers from 500 to n (where n is, for example, 1250, 2500, 5000, 10000 or 20000). That way we can collect (if the calculator has enough memory; there can be a lot of data) a list of accuracy values to see how the calculator behaves with fewer or more operations, plus the time to do:
- basic operations
- exponentials and log
- trigonometric
- matrix/lists - memory
(1) keeping a bit of complexity, so that it will not be so trivial to code a TAN function in ASM.
Edited by pier4r, 15 September 2013 - 09:11 PM.
#55
Posted 17 September 2013 - 06:40 PM
CODE GOSBVL POP# GOSBVL SAVPTR B=A.A C=0.W A=0.W C+1.W A+10.W *ALOGloop C*A.W B-1.W ?B#0.A GOYES ALOGloop R0=C.W C*C.W R1=C.W C/A.W R2=C.W B=C.W *FORloop D=C.W *WHILEloop D-1.W ?D=0.W GOYES endFORloop C*C.W A=R0.W C/A.W A=R1.W C%A.W A=B.W B=C.W ?A#C.W GOYES WHILEloop C=R2.W A=R1.W C+1.W R2=C.W ?C#A.W GOYES FORloop GOVLNG GETPTRLOOP ENDCODE
... takes 0.0359 seconds for k=1 and 1445.8055 seconds for k=2. 19 times as fast as the poor Palm treo pro GSM with its 400MHz CPU.

Another note on optimization in this code: I let the limitCyclicSequences counter run backwards from n to 0, this saves an instruction or two in the WHILE loop compared to the pseudocode version.
About "Casio Addict": I did spend quite some time with Casio calcs in the past, but the 50G is sooo much better. And optimizing stuff beyond all limits is fun. (Who would have thought that skipping the ON key check most of the time in the addloop can improve performance that much?) CasioBasic is hard to optimize, partly because literally everything is so incredibly slow, partly because I can't investigate its implementation like on HP calcs.
If you give me memory-related benchmarks, the smartphones might even have a chance because the calc's memory is quite slow. Trigonometry, logarithms and such are difficult as well due to the missing FPU - you may have noticed that I didn't do the Savage test in ASM.
Edited by 3298, 17 September 2013 - 07:19 PM.
- pier4r likes this
#56
Posted 18 September 2013 - 11:08 AM
About "complex benchmarks" in terms of operations and/or memory use: I know that ASM code without math libraries is hard to write (could you adapt some libraries from the internet?), but nevertheless such benchmarks give a better idea of the calculator's performance with everyday math functions like the trig/exp ones.
Of course, if the square root is not impossible to code with ARM ASM without FPU, i guess what will be the execution time for k=2. Maybe 300 secs.
Edited by pier4r, 18 September 2013 - 11:09 AM.
#57
Posted 18 September 2013 - 02:59 PM
Square roots are usually an FPU operation, but this benchmark doesn't need it. You did calculate a square root in your UserRPL program, but mine already shows that it is not necessary. (You calculate n as 100^k and n_sqrt as sqrt(n); I calculate n_sqrt first as 10^k, then n as n_sqrt^2 - they are equivalent because (10^k)^2=10^(2k)=(10^2)^k=100^k).
With the HPGCC3 patch I do have some math libraries installed, but I need to find out how to call them from plain ARM ASM.
I just noticed that you wrote "@75MHz" behind the Saturn timing. Would be nice to have, but on the emulator the speed increase compared to the 49G with its real 4MHz Saturn is only 1.5x - 2x, depending on what it is doing (for example: the ARM has to do some slower software handling for the Saturn's decimal mode because it doesn't have such a mode itself). Better move it to the system specs or add an emulation note.
Edited by 3298, 18 September 2013 - 03:08 PM.
#58
Posted 18 September 2013 - 03:46 PM
Thanks for the explanation.
300 seconds - could be. But it could as well be 500 seconds. I didn't run the test yet.
Square roots are usually an FPU operation, but this benchmark doesn't need it. You did calculate a square root in your UserRPL program, but mine already shows that it is not necessary. (You calculate n as 100^k and n_sqrt as sqrt(n); I calculate n_sqrt first as 10^k, then n as n_sqrt^2 - they are equivalent because (10^k)^2=10^(2k)=(10^2)^k=100^k).
The annotation tells the "real" speed of the calculator CPU; if that CPU also has to run the emulation, that is an OS/firmware problem. So it means: "saturn ASM code running on the 50g cpu clocked at 75mhz".
I just noticed that you wrote "@75MHz" behind the Saturn timing. Would be nice to have, but on the emulator the speed increase compared to the 49G with its real 4MHz Saturn is only 1.5x - 2x, depending on what it is doing (for example: the ARM has to do some slower software handling for the Saturn's decimal mode because it doesn't have such a mode itself). Better move it to the system specs or add an emulation note.
I don't move it in the system spec because another guy could add, for the same entry: "saturn ASM code running on the 50g cpu at 203mhz", without duplicating the whole entry (system specs and so on). The better value for an entry is chosen for the ranking order.
Anyway, has flyingfisch finished optimizing the addloop test in C for the Prizm?
Edited by pier4r, 18 September 2013 - 03:47 PM.
#59
Posted 19 September 2013 - 08:18 AM
http://www.wiki4hp.c...savage-extended
flyingfisch can you give it a shot with your powerful prizm? Thanks!
#60
Posted 19 September 2013 - 03:55 PM
- swap B=C.W and *FORloop (corrects the error that was actually in my code)
- insert a new line containing *endFORloop after GOYES WHILEloop (corrects my PC-side typing error)
The ARM version will come soon, I just need to figure out where it's taking a shortcut (1.5 seconds for k=3 can't be right).
- pier4r likes this
#61
Posted 19 September 2013 - 05:22 PM
#62
Posted 21 September 2013 - 10:02 AM
For the code, I obviously needed division. This time, I searched for proper implementations of division on the web and found this one in the official infocenter. It is pretty scary how close I got with my own modulo routine, so it was quite optimized after all.

CMP R7,R8 SUBNES R6,R6,1 BNE WHILEloop
Not many processors allow the programmer to write such a compact version of "if(R7!=R8 && (--R6)!=0) goto WHILEloop;". Okay, now the entire code:
CODE A=0.W GOSBVL POP# GOSBVL SAVPTR SKUB { *start !ARM STMDB sp! {R4 R5 R6 R7 R8 R9 R10 LP } LDR R2,[R1,#2316] MOV R3,1 MOV R10,10 *ALOGloop MUL R3,R3,R10 SUBS R2,R2,1 BNE ALOGloop MUL R2,R3,R3 MOV R7,R2 BL divmod MOV R4,R9 *FORloop MOV R5,R4 MOV R6,R4 *WHILEloop MUL R,R5,R5 MOV R10,R3 BL divmod MOV R7,R9 MOV R10,R2 BL divmod MOV R8,R5 MOV R5,R7 CMP R7,R8 SUBNES R6,R6,1 BNE WHILEloop ADD R4,R4,1 CMP R4,R2 BNE FORloop LDMIA sp! {R4 R5 R6 R7 R8 R9 R10 PC} *divmod MOV R8,R10 MOV R9,0 CMP R7,R10 MOVLO PC,LR *.LSloop MOV R8,R8 LSL #1 CMP R8,R7 BLO .LSloop *.RSloop MOVHI R8,R8 LSR #1 CMP R7,R8 SUBHS R7,R7,R8 ADC R9,R9,R9 CMP R8,R10 BHI .RSloop MOV PC,LR !ASM *end } C=RSTK D0=C D1=80100 LC(5)end-start MOVEDN LC 80100 ARMSAT GOVLNG GETPTRLOOP ENDCODE
- pier4r likes this
#63
Posted 22 September 2013 - 09:19 AM
Edited by pier4r, 22 September 2013 - 09:22 AM.
#64
Posted 29 December 2013 - 05:20 PM
I checked several different summation loops on the fx-5800P, and the following turned out to be the fastest so far.
0->S Lbl 0 Isz S Goto 0
I got S=11230 after 60sec using 'finger-sync' with PC software to measure 60sec, so the measurement is approximate. But the result is significant.
For more accurate comparison, I tried decrement command 'Dsz S' from S=1000 to 1, then I got following result.
1) Top fast
- Lbl 0 / Goto 0
- Do / LpWhile (use variable for determination of 0)
- While / WhileEnd (use variable for determination of 0)
Measurement from S=1000 to 1: 6sec
Lbl / Goto
1000->A Lbl 0 Dsz A Goto 0 "DONE" ◣
Do / LpWhile
1000->A Do Dsz A LpWhile A "DONE" ◣
While / WhileEnd
1000->A While A Dsz A WhileEnd "DONE" ◣
2) 2nd fast
- For / Next
Measurement from S=1000 to 1: 7sec
For 1000->A To 0 Step -1 Next "DONE" ◣
Edited by Krtyski, 29 December 2013 - 09:12 PM.
#65
Posted 30 December 2013 - 03:49 AM
Did you try the ultra naive primes benchmark?
#66
Posted 31 December 2013 - 10:27 AM
Did you try the ultra naive primes benchmark?
I did program the fx-5800P for a sort of naive prime benchmark before, searching for a divisor D from 2 up to N-1 for a given natural number N. When such a D is found, the program counts how many times D divides N; it then increments D and repeats the search.
I should say this program is not quite equivalent to your C code; it is a bit more efficient, because the fx-5800P is so slow. I've just programmed the same logic as yours using a double 'For' statement, but it was very slow; apparently one 'For' iteration on the fx-5800P takes about 7 ms. Index J runs from 2 to N-1 and index K from 3 to N-1, so theoretically it takes (N-2)x(N-3)x7 [ms], and the actual measurement was very close to that calculation. For N=1000 it took about 7000 sec.
Anyway a bit effective naive prime resulted in as follows;
N=100: 0.8sec
N=1000: 1sec
N=10000: 1sec
N=100000: 1.1sec
N=1000000: 1.1sec
Lbl 0 Cls 20->DimZ "NAIVE PRIME 1" "INPUT INTEGER"?->N 0->C:s->D:N->X:1->K Do X/D->Y Of Frac(Y)=0 Then Y->X:Isz C Else If C:Then D->Z[K]:C->Z[K+1] Osz K:Isz K 0->C:IfEnd Isz D:IfEnd LpWhile Y≥1 Cls K-2->J:1->L For 1->K To J Step 2 If Z[K+1]=1 Then Locate 1,L,Z[K]◣ Else Locate 1,L,Z[K] 2+Int(log(Z[K]))->M Locate M,L,"^(" Locate M+1,L,Z[K+1]◣ IfEnd Isz L L=5=>1->L Next 0->DimZ Locate1,L,"<EXE> TO RETRY"◣ Goto 0
I also made more effective (fast) Prime Decomposition program and it resulted very fast, for example 987654321 takes only 33sec to get the answer 3^2 x 17^2 x 379721 with the sslowwww fx-5800P.
Edited by Krtyski, 31 December 2013 - 10:40 AM.
#67
Posted 11 March 2018 - 05:59 PM
On cemetech (ti) and hpmuseum.org (old) forum the discussion was already exhausted in September 2013. Thanks for the additions!
Edited by pier4r, 11 March 2018 - 05:59 PM.
1 user(s) are reading this topic
0 members, 1 guests, 0 anonymous users