Branch data Line data Source code
1 : : /*****************************************************************************
2 : : * *
3 : : * Copyright (c) 2012, Intel Corporation *
4 : : * *
5 : : * All rights reserved. *
6 : : * *
7 : : * Redistribution and use in source and binary forms, with or without *
8 : : * modification, are permitted provided that the following conditions are *
9 : : * met: *
10 : : * *
11 : : * * Redistributions of source code must retain the above copyright *
12 : : * notice, this list of conditions and the following disclaimer. *
13 : : * *
14 : : * * Redistributions in binary form must reproduce the above copyright *
15 : : * notice, this list of conditions and the following disclaimer in the *
16 : : * documentation and/or other materials provided with the *
17 : : * distribution. *
18 : : * *
19 : : * * Neither the name of the Intel Corporation nor the names of its *
20 : : * contributors may be used to endorse or promote products derived from *
21 : : * this software without specific prior written permission. *
22 : : * *
23 : : * *
24 : : * THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY *
25 : : * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE *
26 : : * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR *
27 : : * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR *
28 : : * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, *
29 : : * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, *
30 : : * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR *
31 : : * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF *
32 : : * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING *
33 : : * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS *
34 : : * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *
35 : : * *
36 : : ******************************************************************************
37 : : * Developers and authors: *
38 : : * Shay Gueron (1, 2), and Vlad Krasnov (1) *
39 : : * (1) Intel Corporation, Israel Development Center, Haifa, Israel *
40 : : * (2) University of Haifa, Israel *
41 : : *****************************************************************************/
42 : :
43 : : #include "rsaz_exp.h"
44 : :
45 : : /*
46 : : * See crypto/bn/asm/rsaz-avx2.pl for further details.
47 : : */
48 : : void rsaz_1024_norm2red_avx2(void *red,const void *norm);
49 : : void rsaz_1024_mul_avx2(void *ret,const void *a,const void *b,const void *n,BN_ULONG k);
50 : : void rsaz_1024_sqr_avx2(void *ret,const void *a,const void *n,BN_ULONG k,int cnt);
51 : : void rsaz_1024_scatter5_avx2(void *tbl,const void *val,int i);
52 : : void rsaz_1024_gather5_avx2(void *val,const void *tbl,int i);
53 : : void rsaz_1024_red2norm_avx2(void *norm,const void *red);
54 : :
55 : : #if defined(__GNUC__)
56 : : # define ALIGN64 __attribute__((aligned(64)))
57 : : #elif defined(_MSC_VER)
58 : : # define ALIGN64 __declspec(align(64))
59 : : #elif defined(__SUNPRO_C)
60 : : # define ALIGN64
61 : : # pragma align 64(one,two80)
62 : : #else
63 : : # define ALIGN64 /* not fatal, might hurt performance a little */
64 : : #endif
65 : :
66 : : ALIGN64 static const BN_ULONG one[40] =
67 : : {1,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
68 : : ALIGN64 static const BN_ULONG two80[40] =
69 : : {0,0,1<<22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0};
70 : :
71 : 0 : void RSAZ_1024_mod_exp_avx2(BN_ULONG result_norm[16],
72 : : const BN_ULONG base_norm[16], const BN_ULONG exponent[16],
73 : : const BN_ULONG m_norm[16], const BN_ULONG RR[16], BN_ULONG k0)
74 : : {
75 : : unsigned char storage[320*3+32*9*16+64]; /* 5.5KB */
76 : 0 : unsigned char *p_str = storage + (64-((size_t)storage%64));
77 : : unsigned char *a_inv, *m, *result,
78 : 0 : *table_s = p_str+320*3,
79 : 0 : *R2 = table_s; /* borrow */
80 : : int index;
81 : : int wvalue;
82 : :
83 [ # # ]: 0 : if ((((size_t)p_str&4095)+320)>>12) {
84 : 0 : result = p_str;
85 : 0 : a_inv = p_str + 320;
86 : 0 : m = p_str + 320*2; /* should not cross page */
87 : : } else {
88 : 0 : m = p_str; /* should not cross page */
89 : 0 : result = p_str + 320;
90 : 0 : a_inv = p_str + 320*2;
91 : : }
92 : :
93 : 0 : rsaz_1024_norm2red_avx2(m, m_norm);
94 : 0 : rsaz_1024_norm2red_avx2(a_inv, base_norm);
95 : 0 : rsaz_1024_norm2red_avx2(R2, RR);
96 : :
97 : 0 : rsaz_1024_mul_avx2(R2, R2, R2, m, k0);
98 : 0 : rsaz_1024_mul_avx2(R2, R2, two80, m, k0);
99 : :
100 : : /* table[0] = 1 */
101 : 0 : rsaz_1024_mul_avx2(result, R2, one, m, k0);
102 : : /* table[1] = a_inv^1 */
103 : 0 : rsaz_1024_mul_avx2(a_inv, a_inv, R2, m, k0);
104 : :
105 : 0 : rsaz_1024_scatter5_avx2(table_s,result,0);
106 : 0 : rsaz_1024_scatter5_avx2(table_s,a_inv,1);
107 : :
108 : : /* table[2] = a_inv^2 */
109 : 0 : rsaz_1024_sqr_avx2(result, a_inv, m, k0, 1);
110 : 0 : rsaz_1024_scatter5_avx2(table_s,result,2);
111 : : #if 0
112 : : /* this is almost 2x smaller and less than 1% slower */
113 : : for (index=3; index<32; index++) {
114 : : rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
115 : : rsaz_1024_scatter5_avx2(table_s,result,index);
116 : : }
117 : : #else
118 : : /* table[4] = a_inv^4 */
119 : 0 : rsaz_1024_sqr_avx2(result, result, m, k0, 1);
120 : 0 : rsaz_1024_scatter5_avx2(table_s,result,4);
121 : : /* table[8] = a_inv^8 */
122 : 0 : rsaz_1024_sqr_avx2(result, result, m, k0, 1);
123 : 0 : rsaz_1024_scatter5_avx2(table_s,result,8);
124 : : /* table[16] = a_inv^16 */
125 : 0 : rsaz_1024_sqr_avx2(result, result, m, k0, 1);
126 : 0 : rsaz_1024_scatter5_avx2(table_s,result,16);
127 : : /* table[17] = a_inv^17 */
128 : 0 : rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
129 : 0 : rsaz_1024_scatter5_avx2(table_s,result,17);
130 : :
131 : : /* table[3] */
132 : 0 : rsaz_1024_gather5_avx2(result,table_s,2);
133 : 0 : rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
134 : 0 : rsaz_1024_scatter5_avx2(table_s,result,3);
135 : : /* table[6] */
136 : 0 : rsaz_1024_sqr_avx2(result, result, m, k0, 1);
137 : 0 : rsaz_1024_scatter5_avx2(table_s,result,6);
138 : : /* table[12] */
139 : 0 : rsaz_1024_sqr_avx2(result, result, m, k0, 1);
140 : 0 : rsaz_1024_scatter5_avx2(table_s,result,12);
141 : : /* table[24] */
142 : 0 : rsaz_1024_sqr_avx2(result, result, m, k0, 1);
143 : 0 : rsaz_1024_scatter5_avx2(table_s,result,24);
144 : : /* table[25] */
145 : 0 : rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
146 : 0 : rsaz_1024_scatter5_avx2(table_s,result,25);
147 : :
148 : : /* table[5] */
149 : 0 : rsaz_1024_gather5_avx2(result,table_s,4);
150 : 0 : rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
151 : 0 : rsaz_1024_scatter5_avx2(table_s,result,5);
152 : : /* table[10] */
153 : 0 : rsaz_1024_sqr_avx2(result, result, m, k0, 1);
154 : 0 : rsaz_1024_scatter5_avx2(table_s,result,10);
155 : : /* table[20] */
156 : 0 : rsaz_1024_sqr_avx2(result, result, m, k0, 1);
157 : 0 : rsaz_1024_scatter5_avx2(table_s,result,20);
158 : : /* table[21] */
159 : 0 : rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
160 : 0 : rsaz_1024_scatter5_avx2(table_s,result,21);
161 : :
162 : : /* table[7] */
163 : 0 : rsaz_1024_gather5_avx2(result,table_s,6);
164 : 0 : rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
165 : 0 : rsaz_1024_scatter5_avx2(table_s,result,7);
166 : : /* table[14] */
167 : 0 : rsaz_1024_sqr_avx2(result, result, m, k0, 1);
168 : 0 : rsaz_1024_scatter5_avx2(table_s,result,14);
169 : : /* table[28] */
170 : 0 : rsaz_1024_sqr_avx2(result, result, m, k0, 1);
171 : 0 : rsaz_1024_scatter5_avx2(table_s,result,28);
172 : : /* table[29] */
173 : 0 : rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
174 : 0 : rsaz_1024_scatter5_avx2(table_s,result,29);
175 : :
176 : : /* table[9] */
177 : 0 : rsaz_1024_gather5_avx2(result,table_s,8);
178 : 0 : rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
179 : 0 : rsaz_1024_scatter5_avx2(table_s,result,9);
180 : : /* table[18] */
181 : 0 : rsaz_1024_sqr_avx2(result, result, m, k0, 1);
182 : 0 : rsaz_1024_scatter5_avx2(table_s,result,18);
183 : : /* table[19] */
184 : 0 : rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
185 : 0 : rsaz_1024_scatter5_avx2(table_s,result,19);
186 : :
187 : : /* table[11] */
188 : 0 : rsaz_1024_gather5_avx2(result,table_s,10);
189 : 0 : rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
190 : 0 : rsaz_1024_scatter5_avx2(table_s,result,11);
191 : : /* table[22] */
192 : 0 : rsaz_1024_sqr_avx2(result, result, m, k0, 1);
193 : 0 : rsaz_1024_scatter5_avx2(table_s,result,22);
194 : : /* table[23] */
195 : 0 : rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
196 : 0 : rsaz_1024_scatter5_avx2(table_s,result,23);
197 : :
198 : : /* table[13] */
199 : 0 : rsaz_1024_gather5_avx2(result,table_s,12);
200 : 0 : rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
201 : 0 : rsaz_1024_scatter5_avx2(table_s,result,13);
202 : : /* table[26] */
203 : 0 : rsaz_1024_sqr_avx2(result, result, m, k0, 1);
204 : 0 : rsaz_1024_scatter5_avx2(table_s,result,26);
205 : : /* table[27] */
206 : 0 : rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
207 : 0 : rsaz_1024_scatter5_avx2(table_s,result,27);
208 : :
209 : : /* table[15] */
210 : 0 : rsaz_1024_gather5_avx2(result,table_s,14);
211 : 0 : rsaz_1024_mul_avx2(result,result,a_inv,m,k0);
212 : 0 : rsaz_1024_scatter5_avx2(table_s,result,15);
213 : : /* table[30] */
214 : 0 : rsaz_1024_sqr_avx2(result, result, m, k0, 1);
215 : 0 : rsaz_1024_scatter5_avx2(table_s,result,30);
216 : : /* table[31] */
217 : 0 : rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
218 : 0 : rsaz_1024_scatter5_avx2(table_s,result,31);
219 : : #endif
220 : :
221 : : /* load first window */
222 : 0 : p_str = (unsigned char*)exponent;
223 : 0 : wvalue = p_str[127] >> 3;
224 : 0 : rsaz_1024_gather5_avx2(result,table_s,wvalue);
225 : :
226 : 0 : index = 1014;
227 : :
228 [ # # ]: 0 : while(index > -1) { /* loop for the remaining 127 windows */
229 : :
230 : 0 : rsaz_1024_sqr_avx2(result, result, m, k0, 5);
231 : :
232 : 0 : wvalue = *((unsigned short*)&p_str[index/8]);
233 : 0 : wvalue = (wvalue>> (index%8)) & 31;
234 : 0 : index-=5;
235 : :
236 : 0 : rsaz_1024_gather5_avx2(a_inv,table_s,wvalue); /* borrow a_inv */
237 : 0 : rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
238 : : }
239 : :
240 : : /* square four times */
241 : 0 : rsaz_1024_sqr_avx2(result, result, m, k0, 4);
242 : :
243 : 0 : wvalue = p_str[0] & 15;
244 : :
245 : 0 : rsaz_1024_gather5_avx2(a_inv,table_s,wvalue); /* borrow a_inv */
246 : 0 : rsaz_1024_mul_avx2(result, result, a_inv, m, k0);
247 : :
248 : : /* from Montgomery */
249 : 0 : rsaz_1024_mul_avx2(result, result, one, m, k0);
250 : :
251 : 0 : rsaz_1024_red2norm_avx2(result_norm, result);
252 : :
253 : 0 : OPENSSL_cleanse(storage,sizeof(storage));
254 : 0 : }
255 : :
256 : : /*
257 : : * See crypto/bn/rsaz-x86_64.pl for further details.
258 : : */
259 : : void rsaz_512_mul(void *ret,const void *a,const void *b,const void *n,BN_ULONG k);
260 : : void rsaz_512_mul_scatter4(void *ret,const void *a,const void *n,BN_ULONG k,const void *tbl,unsigned int power);
261 : : void rsaz_512_mul_gather4(void *ret,const void *a,const void *tbl,const void *n,BN_ULONG k,unsigned int power);
262 : : void rsaz_512_mul_by_one(void *ret,const void *a,const void *n,BN_ULONG k);
263 : : void rsaz_512_sqr(void *ret,const void *a,const void *n,BN_ULONG k,int cnt);
264 : : void rsaz_512_scatter4(void *tbl, const BN_ULONG *val, int power);
265 : : void rsaz_512_gather4(BN_ULONG *val, const void *tbl, int power);
266 : :
267 : 2168 : void RSAZ_512_mod_exp(BN_ULONG result[8],
268 : : const BN_ULONG base[8], const BN_ULONG exponent[8],
269 : : const BN_ULONG m[8], BN_ULONG k0, const BN_ULONG RR[8])
270 : : {
271 : : unsigned char storage[16*8*8+64*2+64]; /* 1.2KB */
272 : 2168 : unsigned char *table = storage + (64-((size_t)storage%64));
273 : 2168 : BN_ULONG *a_inv = (BN_ULONG *)(table+16*8*8),
274 : 2168 : *temp = (BN_ULONG *)(table+16*8*8+8*8);
275 : 2168 : unsigned char *p_str = (unsigned char*)exponent;
276 : : int index;
277 : : unsigned int wvalue;
278 : :
279 : : /* table[0] = 1_inv */
280 : 2168 : temp[0] = 0-m[0]; temp[1] = ~m[1];
281 : 2168 : temp[2] = ~m[2]; temp[3] = ~m[3];
282 : 2168 : temp[4] = ~m[4]; temp[5] = ~m[5];
283 : 2168 : temp[6] = ~m[6]; temp[7] = ~m[7];
284 : 2168 : rsaz_512_scatter4(table, temp, 0);
285 : :
286 : : /* table [1] = a_inv^1 */
287 : 2168 : rsaz_512_mul(a_inv, base, RR, m, k0);
288 : 2168 : rsaz_512_scatter4(table, a_inv, 1);
289 : :
290 : : /* table [2] = a_inv^2 */
291 : 2168 : rsaz_512_sqr(temp, a_inv, m, k0, 1);
292 : 2168 : rsaz_512_scatter4(table, temp, 2);
293 : :
294 [ + + ]: 30352 : for (index=3; index<16; index++)
295 : 28184 : rsaz_512_mul_scatter4(temp, a_inv, m, k0, table, index);
296 : :
297 : : /* load first window */
298 : 2168 : wvalue = p_str[63];
299 : :
300 : 2168 : rsaz_512_gather4(temp, table, wvalue>>4);
301 : 2168 : rsaz_512_sqr(temp, temp, m, k0, 4);
302 : 2168 : rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue&0xf);
303 : :
304 [ + + ]: 138752 : for (index=62; index>=0; index--) {
305 : 136584 : wvalue = p_str[index];
306 : :
307 : 136584 : rsaz_512_sqr(temp, temp, m, k0, 4);
308 : 136584 : rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue>>4);
309 : :
310 : 136584 : rsaz_512_sqr(temp, temp, m, k0, 4);
311 : 136584 : rsaz_512_mul_gather4(temp, temp, table, m, k0, wvalue&0x0f);
312 : : }
313 : :
314 : : /* from Montgomery */
315 : 2168 : rsaz_512_mul_by_one(result, temp, m, k0);
316 : :
317 : 2168 : OPENSSL_cleanse(storage,sizeof(storage));
318 : 2168 : }
|