From 50a93ce91a368e28aae20b9a1ea109ed9bdc51b4 Mon Sep 17 00:00:00 2001
From: nillerusr
Date: Sun, 25 Apr 2021 23:36:09 +0300
Subject: [PATCH] build: arm target support

---
 common/curl/curl.h                    | 2119 -------
 common/curl/curlbuild.h               |  583 --
 common/curl/curlrules.h               |  252 -
 common/curl/curlver.h                 |   69 -
 common/curl/easy.h                    |  102 -
 common/curl/mprintf.h                 |   81 -
 common/curl/multi.h                   |  345 --
 common/curl/stdcheaders.h             |   33 -
 common/curl/typecheck-gcc.h           |  584 --
 common/curl/types.h                   |    1 -
 common/sse2neon.h                     | 7598 +++++++++++++++++++++++++
 dedicated/wscript                     |   68 +
 dedicated_main/main.cpp               |    3 +
 dedicated_main/wscript                |   43 +
 engine/sys_dll2.cpp                   |    2 +-
 engine/sys_engine.cpp                 |    7 +
 engine/wscript                        |  226 +-
 filesystem/basefilesystem.cpp         |    1 -
 inputsystem/wscript                   |    5 +-
 launcher_main/main.cpp                |    5 +-
 materialsystem/shaderapiempty/wscript |   52 +
 mathlib/sse.cpp                       |   34 +-
 public/bitmap/imageformat.h           |    2 +
 public/materialsystem/imesh.h         |   25 +-
 public/mathlib/mathlib.h              |   13 +-
 public/mathlib/ssemath.h              |    4 +-
 public/mathlib/vector.h               |   10 +-
 public/mathlib/vector4d.h             |   16 +-
 public/saverestoretypes.h             |   14 +-
 public/tier0/platform.h               |   14 +-
 public/tier0/threadtools.h            |    5 +-
 public/tier1/convar.h                 |    5 +-
 public/tier1/utlblockmemory.h         |    2 +
 public/tier1/utlfixedmemory.h         |    2 +
 public/tier1/utllinkedlist.h          |    4 +
 public/tier1/utlmemory.h              |    3 +-
 public/tier1/utlrbtree.h              |    5 +
 public/tier1/utlvector.h              |    5 +
 public/togl/linuxwin/glmgr.h          |    6 +-
 public/vgui/VGUI.h                    |    2 +
 public/vstdlib/pch_vstdlib.h          |    3 +-
 public/vstdlib/random.h               |    5 +-
 tier0/cpu.cpp                         |   22 +-
 tier0/cpu_posix.cpp                   |    3 +-
 tier1/pathmatch.cpp                   |   10 +
 tier1/processor_detect_linux.cpp      |   10 +
 tier1/reliabletimer.cpp               |    8 +
 tier1/strtools.cpp                    |    3 +-
 tier1/wscript                         |    3 +-
 vstdlib/coroutine.cpp                 |    8 +-
 wscript                               |  171 +-
 51 files changed, 8210 insertions(+), 4386 deletions(-)
 delete mode 100644 common/curl/curl.h
 delete mode 100644 common/curl/curlbuild.h
 delete mode 100644 common/curl/curlrules.h
 delete mode 100644 common/curl/curlver.h
 delete mode 100644 common/curl/easy.h
 delete mode 100644 common/curl/mprintf.h
 delete mode 100644 common/curl/multi.h
 delete mode 100644 common/curl/stdcheaders.h
 delete mode 100644 common/curl/typecheck-gcc.h
 delete mode 100644 common/curl/types.h
 create mode 100644 common/sse2neon.h
 create mode 100755 dedicated/wscript
 create mode 100755 dedicated_main/wscript
 create mode 100755 materialsystem/shaderapiempty/wscript

diff --git a/common/curl/curl.h b/common/curl/curl.h
deleted file mode 100644
index cb9d0fbf..00000000
--- a/common/curl/curl.h
+++ /dev/null
@@ -1,2119 +0,0 @@
-#ifndef __CURL_CURL_H
-#define __CURL_CURL_H
-/***************************************************************************
- *                                  _   _ ____  _
- *  Project                     ___| | | |  _ \| |
- *                             / __| | | | |_) | |
- *                            | (__| |_| |  _ <| |___
- *                             \___|\___/|_| \_\_____|
- *
- * Copyright (C) 1998 - 2010, Daniel Stenberg, <daniel@haxx.se>, et al.
- *
- * This software is licensed as described in the file COPYING, which
- * you should have received as part of this distribution. The terms
- * are also available at http://curl.haxx.se/docs/copyright.html.
- *
- * You may opt to use, copy, modify, merge, publish, distribute and/or sell
- * copies of the Software, and permit persons to whom the Software is
- * furnished to do so, under the terms of the COPYING file.
- *
- * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
- * KIND, either express or implied.
- *
- ***************************************************************************/
-
-/*
- * If you have libcurl problems, all docs and details are found here:
- *   http://curl.haxx.se/libcurl/
- *
- * curl-library mailing list subscription and unsubscription web interface:
- *   http://cool.haxx.se/mailman/listinfo/curl-library/
- */
-
-#include "curlver.h"   /* libcurl version defines */
-#include "curlbuild.h" /* libcurl build definitions */
-#include "curlrules.h" /* libcurl rules enforcement */
-
-/*
- * Define WIN32 when build target is Win32 API
- */
-
-#if (defined(_WIN32) || defined(__WIN32__)) && \
-     !defined(WIN32) && !defined(__SYMBIAN32__)
-#define WIN32
-#endif
-
-#include <stdio.h>
-#include <limits.h>
-
-#if defined(__FreeBSD__) && (__FreeBSD__ >= 2)
-/* Needed for __FreeBSD_version symbol definition */
-#include <osreldate.h>
-#endif
-
-/* The include stuff here below is mainly for time_t! */
-#include <sys/types.h>
-#include <time.h>
-
-#if defined(WIN32) && !defined(_WIN32_WCE) && !defined(__GNUC__) && \
-  !defined(__CYGWIN__) || defined(__MINGW32__)
-#if !(defined(_WINSOCKAPI_) || defined(_WINSOCK_H))
-/* The check above prevents the winsock2 inclusion if winsock.h already was
-   included, since they can't co-exist without problems */
-#include <winsock2.h>
-#include <ws2tcpip.h>
-#endif
-#else
-
-/* HP-UX systems version 9, 10 and 11 lack sys/select.h and so does oldish
-   libc5-based Linux systems. Only include it on system that are known to
-   require it! */
-#if defined(_AIX) || defined(__NOVELL_LIBC__) || defined(__NetBSD__) || \
-    defined(__minix) || defined(__SYMBIAN32__) || defined(__INTEGRITY) || \
-    defined(ANDROID) || \
-   (defined(__FreeBSD_version) && (__FreeBSD_version < 800000))
-#include <sys/select.h>
-#endif
-
-#ifndef _WIN32_WCE
-#include <sys/socket.h>
-#endif
-#if !defined(WIN32) && !defined(__WATCOMC__) && !defined(__VXWORKS__)
-#include <sys/time.h>
-#endif
-#include <sys/types.h>
-#endif
-
-#ifdef __BEOS__
-#include <support/SupportDefs.h>
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef void CURL;
-
-/*
- * Decorate exportable functions for Win32 and Symbian OS DLL linking.
- * This avoids using a .def file for building libcurl.dll.
- */
-#if (defined(WIN32) || defined(_WIN32) || defined(__SYMBIAN32__)) && \
-     !defined(CURL_STATICLIB)
-#if defined(BUILDING_LIBCURL)
-#define CURL_EXTERN __declspec(dllexport)
-#else
-#define CURL_EXTERN __declspec(dllimport)
-#endif
-#else
-
-#ifdef CURL_HIDDEN_SYMBOLS
-/*
- * This definition is used to make external definitions visible in the
- * shared library when symbols are hidden by default. It makes no
- * difference when compiling applications whether this is set or not,
- * only when compiling the library.
- */ -#define CURL_EXTERN CURL_EXTERN_SYMBOL -#else -#define CURL_EXTERN -#endif -#endif - -#ifndef curl_socket_typedef -/* socket typedef */ -#ifdef WIN32 -typedef SOCKET curl_socket_t; -#define CURL_SOCKET_BAD INVALID_SOCKET -#else -typedef int curl_socket_t; -#define CURL_SOCKET_BAD -1 -#endif -#define curl_socket_typedef -#endif /* curl_socket_typedef */ - -struct curl_httppost { - struct curl_httppost *next; /* next entry in the list */ - char *name; /* pointer to allocated name */ - long namelength; /* length of name length */ - char *contents; /* pointer to allocated data contents */ - long contentslength; /* length of contents field */ - char *buffer; /* pointer to allocated buffer contents */ - long bufferlength; /* length of buffer field */ - char *contenttype; /* Content-Type */ - struct curl_slist* contentheader; /* list of extra headers for this form */ - struct curl_httppost *more; /* if one field name has more than one - file, this link should link to following - files */ - long flags; /* as defined below */ -#define HTTPPOST_FILENAME (1<<0) /* specified content is a file name */ -#define HTTPPOST_READFILE (1<<1) /* specified content is a file name */ -#define HTTPPOST_PTRNAME (1<<2) /* name is only stored pointer - do not free in formfree */ -#define HTTPPOST_PTRCONTENTS (1<<3) /* contents is only stored pointer - do not free in formfree */ -#define HTTPPOST_BUFFER (1<<4) /* upload file from buffer */ -#define HTTPPOST_PTRBUFFER (1<<5) /* upload file from pointer contents */ -#define HTTPPOST_CALLBACK (1<<6) /* upload file contents by using the - regular read callback to get the data - and pass the given pointer as custom - pointer */ - - char *showfilename; /* The file name to show. If not set, the - actual file name will be used (if this - is a file part) */ - void *userp; /* custom pointer used for - HTTPPOST_CALLBACK posts */ -}; - -typedef int (*curl_progress_callback)(void *clientp, - double dltotal, - double dlnow, - double ultotal, - double ulnow); - -#ifndef CURL_MAX_WRITE_SIZE - /* Tests have proven that 20K is a very bad buffer size for uploads on - Windows, while 16K for some odd reason performed a lot better. - We do the ifndef check to allow this value to easier be changed at build - time for those who feel adventurous. The practical minimum is about - 400 bytes since libcurl uses a buffer of this size as a scratch area - (unrelated to network send operations). */ -#define CURL_MAX_WRITE_SIZE 16384 -#endif - -#ifndef CURL_MAX_HTTP_HEADER -/* The only reason to have a max limit for this is to avoid the risk of a bad - server feeding libcurl with a never-ending header that will cause reallocs - infinitely */ -#define CURL_MAX_HTTP_HEADER (100*1024) -#endif - - -/* This is a magic return code for the write callback that, when returned, - will signal libcurl to pause receiving on the current transfer. 
*/ -#define CURL_WRITEFUNC_PAUSE 0x10000001 -typedef size_t (*curl_write_callback)(char *buffer, - size_t size, - size_t nitems, - void *outstream); - - - -/* enumeration of file types */ -typedef enum { - CURLFILETYPE_FILE = 0, - CURLFILETYPE_DIRECTORY, - CURLFILETYPE_SYMLINK, - CURLFILETYPE_DEVICE_BLOCK, - CURLFILETYPE_DEVICE_CHAR, - CURLFILETYPE_NAMEDPIPE, - CURLFILETYPE_SOCKET, - CURLFILETYPE_DOOR, /* is possible only on Sun Solaris now */ - - CURLFILETYPE_UNKNOWN /* should never occur */ -} curlfiletype; - -#define CURLFINFOFLAG_KNOWN_FILENAME (1<<0) -#define CURLFINFOFLAG_KNOWN_FILETYPE (1<<1) -#define CURLFINFOFLAG_KNOWN_TIME (1<<2) -#define CURLFINFOFLAG_KNOWN_PERM (1<<3) -#define CURLFINFOFLAG_KNOWN_UID (1<<4) -#define CURLFINFOFLAG_KNOWN_GID (1<<5) -#define CURLFINFOFLAG_KNOWN_SIZE (1<<6) -#define CURLFINFOFLAG_KNOWN_HLINKCOUNT (1<<7) - -/* Content of this structure depends on information which is known and is - achievable (e.g. by FTP LIST parsing). Please see the url_easy_setopt(3) man - page for callbacks returning this structure -- some fields are mandatory, - some others are optional. The FLAG field has special meaning. */ -struct curl_fileinfo { - char *filename; - curlfiletype filetype; - time_t time; - unsigned int perm; - int uid; - int gid; - curl_off_t size; - long int hardlinks; - - struct { - /* If some of these fields is not NULL, it is a pointer to b_data. */ - char *time; - char *perm; - char *user; - char *group; - char *target; /* pointer to the target filename of a symlink */ - } strings; - - unsigned int flags; - - /* used internally */ - char * b_data; - size_t b_size; - size_t b_used; -}; - -/* return codes for CURLOPT_CHUNK_BGN_FUNCTION */ -#define CURL_CHUNK_BGN_FUNC_OK 0 -#define CURL_CHUNK_BGN_FUNC_FAIL 1 /* tell the lib to end the task */ -#define CURL_CHUNK_BGN_FUNC_SKIP 2 /* skip this chunk over */ - -/* if splitting of data transfer is enabled, this callback is called before - download of an individual chunk started. Note that parameter "remains" works - only for FTP wildcard downloading (for now), otherwise is not used */ -typedef long (*curl_chunk_bgn_callback)(const void *transfer_info, - void *ptr, - int remains); - -/* return codes for CURLOPT_CHUNK_END_FUNCTION */ -#define CURL_CHUNK_END_FUNC_OK 0 -#define CURL_CHUNK_END_FUNC_FAIL 1 /* tell the lib to end the task */ - -/* If splitting of data transfer is enabled this callback is called after - download of an individual chunk finished. - Note! After this callback was set then it have to be called FOR ALL chunks. - Even if downloading of this chunk was skipped in CHUNK_BGN_FUNC. - This is the reason why we don't need "transfer_info" parameter in this - callback and we are not interested in "remains" parameter too. */ -typedef long (*curl_chunk_end_callback)(void *ptr); - -/* return codes for FNMATCHFUNCTION */ -#define CURL_FNMATCHFUNC_MATCH 0 /* string corresponds to the pattern */ -#define CURL_FNMATCHFUNC_NOMATCH 1 /* pattern doesn't match the string */ -#define CURL_FNMATCHFUNC_FAIL 2 /* an error occurred */ - -/* callback type for wildcard downloading pattern matching. If the - string matches the pattern, return CURL_FNMATCHFUNC_MATCH value, etc. 
*/ -typedef int (*curl_fnmatch_callback)(void *ptr, - const char *pattern, - const char *string); - -/* These are the return codes for the seek callbacks */ -#define CURL_SEEKFUNC_OK 0 -#define CURL_SEEKFUNC_FAIL 1 /* fail the entire transfer */ -#define CURL_SEEKFUNC_CANTSEEK 2 /* tell libcurl seeking can't be done, so - libcurl might try other means instead */ -typedef int (*curl_seek_callback)(void *instream, - curl_off_t offset, - int origin); /* 'whence' */ - -/* This is a return code for the read callback that, when returned, will - signal libcurl to immediately abort the current transfer. */ -#define CURL_READFUNC_ABORT 0x10000000 -/* This is a return code for the read callback that, when returned, will - signal libcurl to pause sending data on the current transfer. */ -#define CURL_READFUNC_PAUSE 0x10000001 - -typedef size_t (*curl_read_callback)(char *buffer, - size_t size, - size_t nitems, - void *instream); - -typedef enum { - CURLSOCKTYPE_IPCXN, /* socket created for a specific IP connection */ - CURLSOCKTYPE_LAST /* never use */ -} curlsocktype; - -typedef int (*curl_sockopt_callback)(void *clientp, - curl_socket_t curlfd, - curlsocktype purpose); - -struct curl_sockaddr { - int family; - int socktype; - int protocol; - unsigned int addrlen; /* addrlen was a socklen_t type before 7.18.0 but it - turned really ugly and painful on the systems that - lack this type */ - struct sockaddr addr; -}; - -typedef curl_socket_t -(*curl_opensocket_callback)(void *clientp, - curlsocktype purpose, - struct curl_sockaddr *address); - -typedef enum { - CURLIOE_OK, /* I/O operation successful */ - CURLIOE_UNKNOWNCMD, /* command was unknown to callback */ - CURLIOE_FAILRESTART, /* failed to restart the read */ - CURLIOE_LAST /* never use */ -} curlioerr; - -typedef enum { - CURLIOCMD_NOP, /* no operation */ - CURLIOCMD_RESTARTREAD, /* restart the read stream from start */ - CURLIOCMD_LAST /* never use */ -} curliocmd; - -typedef curlioerr (*curl_ioctl_callback)(CURL *handle, - int cmd, - void *clientp); - -/* - * The following typedef's are signatures of malloc, free, realloc, strdup and - * calloc respectively. Function pointers of these types can be passed to the - * curl_global_init_mem() function to set user defined memory management - * callback routines. - */ -typedef void *(*curl_malloc_callback)(size_t size); -typedef void (*curl_free_callback)(void *ptr); -typedef void *(*curl_realloc_callback)(void *ptr, size_t size); -typedef char *(*curl_strdup_callback)(const char *str); -typedef void *(*curl_calloc_callback)(size_t nmemb, size_t size); - -/* the kind of data that is passed to information_callback*/ -typedef enum { - CURLINFO_TEXT = 0, - CURLINFO_HEADER_IN, /* 1 */ - CURLINFO_HEADER_OUT, /* 2 */ - CURLINFO_DATA_IN, /* 3 */ - CURLINFO_DATA_OUT, /* 4 */ - CURLINFO_SSL_DATA_IN, /* 5 */ - CURLINFO_SSL_DATA_OUT, /* 6 */ - CURLINFO_END -} curl_infotype; - -typedef int (*curl_debug_callback) - (CURL *handle, /* the handle/transfer this concerns */ - curl_infotype type, /* what kind of data */ - char *data, /* points to the data */ - size_t size, /* size of the data pointed to */ - void *userptr); /* whatever the user please */ - -/* All possible error codes from all sorts of curl functions. Future versions - may return other values, stay prepared. - - Always add new return codes last. Never *EVER* remove any. The return - codes must remain the same! 
- */ - -typedef enum { - CURLE_OK = 0, - CURLE_UNSUPPORTED_PROTOCOL, /* 1 */ - CURLE_FAILED_INIT, /* 2 */ - CURLE_URL_MALFORMAT, /* 3 */ - CURLE_OBSOLETE4, /* 4 - NOT USED */ - CURLE_COULDNT_RESOLVE_PROXY, /* 5 */ - CURLE_COULDNT_RESOLVE_HOST, /* 6 */ - CURLE_COULDNT_CONNECT, /* 7 */ - CURLE_FTP_WEIRD_SERVER_REPLY, /* 8 */ - CURLE_REMOTE_ACCESS_DENIED, /* 9 a service was denied by the server - due to lack of access - when login fails - this is not returned. */ - CURLE_OBSOLETE10, /* 10 - NOT USED */ - CURLE_FTP_WEIRD_PASS_REPLY, /* 11 */ - CURLE_OBSOLETE12, /* 12 - NOT USED */ - CURLE_FTP_WEIRD_PASV_REPLY, /* 13 */ - CURLE_FTP_WEIRD_227_FORMAT, /* 14 */ - CURLE_FTP_CANT_GET_HOST, /* 15 */ - CURLE_OBSOLETE16, /* 16 - NOT USED */ - CURLE_FTP_COULDNT_SET_TYPE, /* 17 */ - CURLE_PARTIAL_FILE, /* 18 */ - CURLE_FTP_COULDNT_RETR_FILE, /* 19 */ - CURLE_OBSOLETE20, /* 20 - NOT USED */ - CURLE_QUOTE_ERROR, /* 21 - quote command failure */ - CURLE_HTTP_RETURNED_ERROR, /* 22 */ - CURLE_WRITE_ERROR, /* 23 */ - CURLE_OBSOLETE24, /* 24 - NOT USED */ - CURLE_UPLOAD_FAILED, /* 25 - failed upload "command" */ - CURLE_READ_ERROR, /* 26 - couldn't open/read from file */ - CURLE_OUT_OF_MEMORY, /* 27 */ - /* Note: CURLE_OUT_OF_MEMORY may sometimes indicate a conversion error - instead of a memory allocation error if CURL_DOES_CONVERSIONS - is defined - */ - CURLE_OPERATION_TIMEDOUT, /* 28 - the timeout time was reached */ - CURLE_OBSOLETE29, /* 29 - NOT USED */ - CURLE_FTP_PORT_FAILED, /* 30 - FTP PORT operation failed */ - CURLE_FTP_COULDNT_USE_REST, /* 31 - the REST command failed */ - CURLE_OBSOLETE32, /* 32 - NOT USED */ - CURLE_RANGE_ERROR, /* 33 - RANGE "command" didn't work */ - CURLE_HTTP_POST_ERROR, /* 34 */ - CURLE_SSL_CONNECT_ERROR, /* 35 - wrong when connecting with SSL */ - CURLE_BAD_DOWNLOAD_RESUME, /* 36 - couldn't resume download */ - CURLE_FILE_COULDNT_READ_FILE, /* 37 */ - CURLE_LDAP_CANNOT_BIND, /* 38 */ - CURLE_LDAP_SEARCH_FAILED, /* 39 */ - CURLE_OBSOLETE40, /* 40 - NOT USED */ - CURLE_FUNCTION_NOT_FOUND, /* 41 */ - CURLE_ABORTED_BY_CALLBACK, /* 42 */ - CURLE_BAD_FUNCTION_ARGUMENT, /* 43 */ - CURLE_OBSOLETE44, /* 44 - NOT USED */ - CURLE_INTERFACE_FAILED, /* 45 - CURLOPT_INTERFACE failed */ - CURLE_OBSOLETE46, /* 46 - NOT USED */ - CURLE_TOO_MANY_REDIRECTS , /* 47 - catch endless re-direct loops */ - CURLE_UNKNOWN_TELNET_OPTION, /* 48 - User specified an unknown option */ - CURLE_TELNET_OPTION_SYNTAX , /* 49 - Malformed telnet option */ - CURLE_OBSOLETE50, /* 50 - NOT USED */ - CURLE_PEER_FAILED_VERIFICATION, /* 51 - peer's certificate or fingerprint - wasn't verified fine */ - CURLE_GOT_NOTHING, /* 52 - when this is a specific error */ - CURLE_SSL_ENGINE_NOTFOUND, /* 53 - SSL crypto engine not found */ - CURLE_SSL_ENGINE_SETFAILED, /* 54 - can not set SSL crypto engine as - default */ - CURLE_SEND_ERROR, /* 55 - failed sending network data */ - CURLE_RECV_ERROR, /* 56 - failure in receiving network data */ - CURLE_OBSOLETE57, /* 57 - NOT IN USE */ - CURLE_SSL_CERTPROBLEM, /* 58 - problem with the local certificate */ - CURLE_SSL_CIPHER, /* 59 - couldn't use specified cipher */ - CURLE_SSL_CACERT, /* 60 - problem with the CA cert (path?) 
*/ - CURLE_BAD_CONTENT_ENCODING, /* 61 - Unrecognized transfer encoding */ - CURLE_LDAP_INVALID_URL, /* 62 - Invalid LDAP URL */ - CURLE_FILESIZE_EXCEEDED, /* 63 - Maximum file size exceeded */ - CURLE_USE_SSL_FAILED, /* 64 - Requested FTP SSL level failed */ - CURLE_SEND_FAIL_REWIND, /* 65 - Sending the data requires a rewind - that failed */ - CURLE_SSL_ENGINE_INITFAILED, /* 66 - failed to initialise ENGINE */ - CURLE_LOGIN_DENIED, /* 67 - user, password or similar was not - accepted and we failed to login */ - CURLE_TFTP_NOTFOUND, /* 68 - file not found on server */ - CURLE_TFTP_PERM, /* 69 - permission problem on server */ - CURLE_REMOTE_DISK_FULL, /* 70 - out of disk space on server */ - CURLE_TFTP_ILLEGAL, /* 71 - Illegal TFTP operation */ - CURLE_TFTP_UNKNOWNID, /* 72 - Unknown transfer ID */ - CURLE_REMOTE_FILE_EXISTS, /* 73 - File already exists */ - CURLE_TFTP_NOSUCHUSER, /* 74 - No such user */ - CURLE_CONV_FAILED, /* 75 - conversion failed */ - CURLE_CONV_REQD, /* 76 - caller must register conversion - callbacks using curl_easy_setopt options - CURLOPT_CONV_FROM_NETWORK_FUNCTION, - CURLOPT_CONV_TO_NETWORK_FUNCTION, and - CURLOPT_CONV_FROM_UTF8_FUNCTION */ - CURLE_SSL_CACERT_BADFILE, /* 77 - could not load CACERT file, missing - or wrong format */ - CURLE_REMOTE_FILE_NOT_FOUND, /* 78 - remote file not found */ - CURLE_SSH, /* 79 - error from the SSH layer, somewhat - generic so the error message will be of - interest when this has happened */ - - CURLE_SSL_SHUTDOWN_FAILED, /* 80 - Failed to shut down the SSL - connection */ - CURLE_AGAIN, /* 81 - socket is not ready for send/recv, - wait till it's ready and try again (Added - in 7.18.2) */ - CURLE_SSL_CRL_BADFILE, /* 82 - could not load CRL file, missing or - wrong format (Added in 7.19.0) */ - CURLE_SSL_ISSUER_ERROR, /* 83 - Issuer check failed. (Added in - 7.19.0) */ - CURLE_FTP_PRET_FAILED, /* 84 - a PRET command failed */ - CURLE_RTSP_CSEQ_ERROR, /* 85 - mismatch of RTSP CSeq numbers */ - CURLE_RTSP_SESSION_ERROR, /* 86 - mismatch of RTSP Session Identifiers */ - CURLE_FTP_BAD_FILE_LIST, /* 87 - unable to parse FTP file list */ - CURLE_CHUNK_FAILED, /* 88 - chunk callback reported error */ - - CURL_LAST /* never use! */ -} CURLcode; - -#ifndef CURL_NO_OLDIES /* define this to test if your app builds with all - the obsolete stuff removed! */ - -/* Backwards compatibility with older names */ - -/* The following were added in 7.17.1 */ -/* These are scheduled to disappear by 2009 */ -#define CURLE_SSL_PEER_CERTIFICATE CURLE_PEER_FAILED_VERIFICATION - -/* The following were added in 7.17.0 */ -/* These are scheduled to disappear by 2009 */ -#define CURLE_OBSOLETE CURLE_OBSOLETE50 /* noone should be using this! 
*/ -#define CURLE_BAD_PASSWORD_ENTERED CURLE_OBSOLETE46 -#define CURLE_BAD_CALLING_ORDER CURLE_OBSOLETE44 -#define CURLE_FTP_USER_PASSWORD_INCORRECT CURLE_OBSOLETE10 -#define CURLE_FTP_CANT_RECONNECT CURLE_OBSOLETE16 -#define CURLE_FTP_COULDNT_GET_SIZE CURLE_OBSOLETE32 -#define CURLE_FTP_COULDNT_SET_ASCII CURLE_OBSOLETE29 -#define CURLE_FTP_WEIRD_USER_REPLY CURLE_OBSOLETE12 -#define CURLE_FTP_WRITE_ERROR CURLE_OBSOLETE20 -#define CURLE_LIBRARY_NOT_FOUND CURLE_OBSOLETE40 -#define CURLE_MALFORMAT_USER CURLE_OBSOLETE24 -#define CURLE_SHARE_IN_USE CURLE_OBSOLETE57 -#define CURLE_URL_MALFORMAT_USER CURLE_OBSOLETE4 - -#define CURLE_FTP_ACCESS_DENIED CURLE_REMOTE_ACCESS_DENIED -#define CURLE_FTP_COULDNT_SET_BINARY CURLE_FTP_COULDNT_SET_TYPE -#define CURLE_FTP_QUOTE_ERROR CURLE_QUOTE_ERROR -#define CURLE_TFTP_DISKFULL CURLE_REMOTE_DISK_FULL -#define CURLE_TFTP_EXISTS CURLE_REMOTE_FILE_EXISTS -#define CURLE_HTTP_RANGE_ERROR CURLE_RANGE_ERROR -#define CURLE_FTP_SSL_FAILED CURLE_USE_SSL_FAILED - -/* The following were added earlier */ - -#define CURLE_OPERATION_TIMEOUTED CURLE_OPERATION_TIMEDOUT - -#define CURLE_HTTP_NOT_FOUND CURLE_HTTP_RETURNED_ERROR -#define CURLE_HTTP_PORT_FAILED CURLE_INTERFACE_FAILED -#define CURLE_FTP_COULDNT_STOR_FILE CURLE_UPLOAD_FAILED - -#define CURLE_FTP_PARTIAL_FILE CURLE_PARTIAL_FILE -#define CURLE_FTP_BAD_DOWNLOAD_RESUME CURLE_BAD_DOWNLOAD_RESUME - -/* This was the error code 50 in 7.7.3 and a few earlier versions, this - is no longer used by libcurl but is instead #defined here only to not - make programs break */ -#define CURLE_ALREADY_COMPLETE 99999 - -#endif /*!CURL_NO_OLDIES*/ - -/* This prototype applies to all conversion callbacks */ -typedef CURLcode (*curl_conv_callback)(char *buffer, size_t length); - -typedef CURLcode (*curl_ssl_ctx_callback)(CURL *curl, /* easy handle */ - void *ssl_ctx, /* actually an - OpenSSL SSL_CTX */ - void *userptr); - -typedef enum { - CURLPROXY_HTTP = 0, /* added in 7.10, new in 7.19.4 default is to use - CONNECT HTTP/1.1 */ - CURLPROXY_HTTP_1_0 = 1, /* added in 7.19.4, force to use CONNECT - HTTP/1.0 */ - CURLPROXY_SOCKS4 = 4, /* support added in 7.15.2, enum existed already - in 7.10 */ - CURLPROXY_SOCKS5 = 5, /* added in 7.10 */ - CURLPROXY_SOCKS4A = 6, /* added in 7.18.0 */ - CURLPROXY_SOCKS5_HOSTNAME = 7 /* Use the SOCKS5 protocol but pass along the - host name rather than the IP address. 
added - in 7.18.0 */ -} curl_proxytype; /* this enum was added in 7.10 */ - -#define CURLAUTH_NONE 0 /* nothing */ -#define CURLAUTH_BASIC (1<<0) /* Basic (default) */ -#define CURLAUTH_DIGEST (1<<1) /* Digest */ -#define CURLAUTH_GSSNEGOTIATE (1<<2) /* GSS-Negotiate */ -#define CURLAUTH_NTLM (1<<3) /* NTLM */ -#define CURLAUTH_DIGEST_IE (1<<4) /* Digest with IE flavour */ -#define CURLAUTH_ANY (~CURLAUTH_DIGEST_IE) /* all fine types set */ -#define CURLAUTH_ANYSAFE (~(CURLAUTH_BASIC|CURLAUTH_DIGEST_IE)) - -#define CURLSSH_AUTH_ANY ~0 /* all types supported by the server */ -#define CURLSSH_AUTH_NONE 0 /* none allowed, silly but complete */ -#define CURLSSH_AUTH_PUBLICKEY (1<<0) /* public/private key files */ -#define CURLSSH_AUTH_PASSWORD (1<<1) /* password */ -#define CURLSSH_AUTH_HOST (1<<2) /* host key files */ -#define CURLSSH_AUTH_KEYBOARD (1<<3) /* keyboard interactive */ -#define CURLSSH_AUTH_DEFAULT CURLSSH_AUTH_ANY - -#define CURL_ERROR_SIZE 256 - -struct curl_khkey { - const char *key; /* points to a zero-terminated string encoded with base64 - if len is zero, otherwise to the "raw" data */ - size_t len; - enum type { - CURLKHTYPE_UNKNOWN, - CURLKHTYPE_RSA1, - CURLKHTYPE_RSA, - CURLKHTYPE_DSS - } keytype; -}; - -/* this is the set of return values expected from the curl_sshkeycallback - callback */ -enum curl_khstat { - CURLKHSTAT_FINE_ADD_TO_FILE, - CURLKHSTAT_FINE, - CURLKHSTAT_REJECT, /* reject the connection, return an error */ - CURLKHSTAT_DEFER, /* do not accept it, but we can't answer right now so - this causes a CURLE_DEFER error but otherwise the - connection will be left intact etc */ - CURLKHSTAT_LAST /* not for use, only a marker for last-in-list */ -}; - -/* this is the set of status codes pass in to the callback */ -enum curl_khmatch { - CURLKHMATCH_OK, /* match */ - CURLKHMATCH_MISMATCH, /* host found, key mismatch! */ - CURLKHMATCH_MISSING, /* no matching host/key found */ - CURLKHMATCH_LAST /* not for use, only a marker for last-in-list */ -}; - -typedef int - (*curl_sshkeycallback) (CURL *easy, /* easy handle */ - const struct curl_khkey *knownkey, /* known */ - const struct curl_khkey *foundkey, /* found */ - enum curl_khmatch, /* libcurl's view on the keys */ - void *clientp); /* custom pointer passed from app */ - -/* parameter for the CURLOPT_USE_SSL option */ -typedef enum { - CURLUSESSL_NONE, /* do not attempt to use SSL */ - CURLUSESSL_TRY, /* try using SSL, proceed anyway otherwise */ - CURLUSESSL_CONTROL, /* SSL for the control connection or fail */ - CURLUSESSL_ALL, /* SSL for all communication or fail */ - CURLUSESSL_LAST /* not an option, never use */ -} curl_usessl; - -#ifndef CURL_NO_OLDIES /* define this to test if your app builds with all - the obsolete stuff removed! 
*/ - -/* Backwards compatibility with older names */ -/* These are scheduled to disappear by 2009 */ - -#define CURLFTPSSL_NONE CURLUSESSL_NONE -#define CURLFTPSSL_TRY CURLUSESSL_TRY -#define CURLFTPSSL_CONTROL CURLUSESSL_CONTROL -#define CURLFTPSSL_ALL CURLUSESSL_ALL -#define CURLFTPSSL_LAST CURLUSESSL_LAST -#define curl_ftpssl curl_usessl -#endif /*!CURL_NO_OLDIES*/ - -/* parameter for the CURLOPT_FTP_SSL_CCC option */ -typedef enum { - CURLFTPSSL_CCC_NONE, /* do not send CCC */ - CURLFTPSSL_CCC_PASSIVE, /* Let the server initiate the shutdown */ - CURLFTPSSL_CCC_ACTIVE, /* Initiate the shutdown */ - CURLFTPSSL_CCC_LAST /* not an option, never use */ -} curl_ftpccc; - -/* parameter for the CURLOPT_FTPSSLAUTH option */ -typedef enum { - CURLFTPAUTH_DEFAULT, /* let libcurl decide */ - CURLFTPAUTH_SSL, /* use "AUTH SSL" */ - CURLFTPAUTH_TLS, /* use "AUTH TLS" */ - CURLFTPAUTH_LAST /* not an option, never use */ -} curl_ftpauth; - -/* parameter for the CURLOPT_FTP_CREATE_MISSING_DIRS option */ -typedef enum { - CURLFTP_CREATE_DIR_NONE, /* do NOT create missing dirs! */ - CURLFTP_CREATE_DIR, /* (FTP/SFTP) if CWD fails, try MKD and then CWD - again if MKD succeeded, for SFTP this does - similar magic */ - CURLFTP_CREATE_DIR_RETRY, /* (FTP only) if CWD fails, try MKD and then CWD - again even if MKD failed! */ - CURLFTP_CREATE_DIR_LAST /* not an option, never use */ -} curl_ftpcreatedir; - -/* parameter for the CURLOPT_FTP_FILEMETHOD option */ -typedef enum { - CURLFTPMETHOD_DEFAULT, /* let libcurl pick */ - CURLFTPMETHOD_MULTICWD, /* single CWD operation for each path part */ - CURLFTPMETHOD_NOCWD, /* no CWD at all */ - CURLFTPMETHOD_SINGLECWD, /* one CWD to full dir, then work on file */ - CURLFTPMETHOD_LAST /* not an option, never use */ -} curl_ftpmethod; - -/* CURLPROTO_ defines are for the CURLOPT_*PROTOCOLS options */ -#define CURLPROTO_HTTP (1<<0) -#define CURLPROTO_HTTPS (1<<1) -#define CURLPROTO_FTP (1<<2) -#define CURLPROTO_FTPS (1<<3) -#define CURLPROTO_SCP (1<<4) -#define CURLPROTO_SFTP (1<<5) -#define CURLPROTO_TELNET (1<<6) -#define CURLPROTO_LDAP (1<<7) -#define CURLPROTO_LDAPS (1<<8) -#define CURLPROTO_DICT (1<<9) -#define CURLPROTO_FILE (1<<10) -#define CURLPROTO_TFTP (1<<11) -#define CURLPROTO_IMAP (1<<12) -#define CURLPROTO_IMAPS (1<<13) -#define CURLPROTO_POP3 (1<<14) -#define CURLPROTO_POP3S (1<<15) -#define CURLPROTO_SMTP (1<<16) -#define CURLPROTO_SMTPS (1<<17) -#define CURLPROTO_RTSP (1<<18) -#define CURLPROTO_RTMP (1<<19) -#define CURLPROTO_RTMPT (1<<20) -#define CURLPROTO_RTMPE (1<<21) -#define CURLPROTO_RTMPTE (1<<22) -#define CURLPROTO_RTMPS (1<<23) -#define CURLPROTO_RTMPTS (1<<24) -#define CURLPROTO_GOPHER (1<<25) -#define CURLPROTO_ALL (~0) /* enable everything */ - -/* long may be 32 or 64 bits, but we should never depend on anything else - but 32 */ -#define CURLOPTTYPE_LONG 0 -#define CURLOPTTYPE_OBJECTPOINT 10000 -#define CURLOPTTYPE_FUNCTIONPOINT 20000 -#define CURLOPTTYPE_OFF_T 30000 - -/* name is uppercase CURLOPT_, - type is one of the defined CURLOPTTYPE_ - number is unique identifier */ -#ifdef CINIT -#undef CINIT -#endif - -#ifdef CURL_ISOCPP -#define CINIT(name,type,number) CURLOPT_ ## name = CURLOPTTYPE_ ## type + number -#else -/* The macro "##" is ISO C, we assume pre-ISO C doesn't support it. 
*/ -#define LONG CURLOPTTYPE_LONG -#define OBJECTPOINT CURLOPTTYPE_OBJECTPOINT -#define FUNCTIONPOINT CURLOPTTYPE_FUNCTIONPOINT -#define OFF_T CURLOPTTYPE_OFF_T -#define CINIT(name,type,number) CURLOPT_/**/name = type + number -#endif - -/* - * This macro-mania below setups the CURLOPT_[what] enum, to be used with - * curl_easy_setopt(). The first argument in the CINIT() macro is the [what] - * word. - */ - -typedef enum { - /* This is the FILE * or void * the regular output should be written to. */ - CINIT(FILE, OBJECTPOINT, 1), - - /* The full URL to get/put */ - CINIT(URL, OBJECTPOINT, 2), - - /* Port number to connect to, if other than default. */ - CINIT(PORT, LONG, 3), - - /* Name of proxy to use. */ - CINIT(PROXY, OBJECTPOINT, 4), - - /* "name:password" to use when fetching. */ - CINIT(USERPWD, OBJECTPOINT, 5), - - /* "name:password" to use with proxy. */ - CINIT(PROXYUSERPWD, OBJECTPOINT, 6), - - /* Range to get, specified as an ASCII string. */ - CINIT(RANGE, OBJECTPOINT, 7), - - /* not used */ - - /* Specified file stream to upload from (use as input): */ - CINIT(INFILE, OBJECTPOINT, 9), - - /* Buffer to receive error messages in, must be at least CURL_ERROR_SIZE - * bytes big. If this is not used, error messages go to stderr instead: */ - CINIT(ERRORBUFFER, OBJECTPOINT, 10), - - /* Function that will be called to store the output (instead of fwrite). The - * parameters will use fwrite() syntax, make sure to follow them. */ - CINIT(WRITEFUNCTION, FUNCTIONPOINT, 11), - - /* Function that will be called to read the input (instead of fread). The - * parameters will use fread() syntax, make sure to follow them. */ - CINIT(READFUNCTION, FUNCTIONPOINT, 12), - - /* Time-out the read operation after this amount of seconds */ - CINIT(TIMEOUT, LONG, 13), - - /* If the CURLOPT_INFILE is used, this can be used to inform libcurl about - * how large the file being sent really is. That allows better error - * checking and better verifies that the upload was successful. -1 means - * unknown size. - * - * For large file support, there is also a _LARGE version of the key - * which takes an off_t type, allowing platforms with larger off_t - * sizes to handle larger files. See below for INFILESIZE_LARGE. - */ - CINIT(INFILESIZE, LONG, 14), - - /* POST static input fields. */ - CINIT(POSTFIELDS, OBJECTPOINT, 15), - - /* Set the referrer page (needed by some CGIs) */ - CINIT(REFERER, OBJECTPOINT, 16), - - /* Set the FTP PORT string (interface name, named or numerical IP address) - Use i.e '-' to use default address. */ - CINIT(FTPPORT, OBJECTPOINT, 17), - - /* Set the User-Agent string (examined by some CGIs) */ - CINIT(USERAGENT, OBJECTPOINT, 18), - - /* If the download receives less than "low speed limit" bytes/second - * during "low speed time" seconds, the operations is aborted. - * You could i.e if you have a pretty high speed connection, abort if - * it is less than 2000 bytes/sec during 20 seconds. - */ - - /* Set the "low speed limit" */ - CINIT(LOW_SPEED_LIMIT, LONG, 19), - - /* Set the "low speed time" */ - CINIT(LOW_SPEED_TIME, LONG, 20), - - /* Set the continuation offset. - * - * Note there is also a _LARGE version of this key which uses - * off_t types, allowing for large file offsets on platforms which - * use larger-than-32-bit off_t's. Look below for RESUME_FROM_LARGE. 
- */ - CINIT(RESUME_FROM, LONG, 21), - - /* Set cookie in request: */ - CINIT(COOKIE, OBJECTPOINT, 22), - - /* This points to a linked list of headers, struct curl_slist kind */ - CINIT(HTTPHEADER, OBJECTPOINT, 23), - - /* This points to a linked list of post entries, struct curl_httppost */ - CINIT(HTTPPOST, OBJECTPOINT, 24), - - /* name of the file keeping your private SSL-certificate */ - CINIT(SSLCERT, OBJECTPOINT, 25), - - /* password for the SSL or SSH private key */ - CINIT(KEYPASSWD, OBJECTPOINT, 26), - - /* send TYPE parameter? */ - CINIT(CRLF, LONG, 27), - - /* send linked-list of QUOTE commands */ - CINIT(QUOTE, OBJECTPOINT, 28), - - /* send FILE * or void * to store headers to, if you use a callback it - is simply passed to the callback unmodified */ - CINIT(WRITEHEADER, OBJECTPOINT, 29), - - /* point to a file to read the initial cookies from, also enables - "cookie awareness" */ - CINIT(COOKIEFILE, OBJECTPOINT, 31), - - /* What version to specifically try to use. - See CURL_SSLVERSION defines below. */ - CINIT(SSLVERSION, LONG, 32), - - /* What kind of HTTP time condition to use, see defines */ - CINIT(TIMECONDITION, LONG, 33), - - /* Time to use with the above condition. Specified in number of seconds - since 1 Jan 1970 */ - CINIT(TIMEVALUE, LONG, 34), - - /* 35 = OBSOLETE */ - - /* Custom request, for customizing the get command like - HTTP: DELETE, TRACE and others - FTP: to use a different list command - */ - CINIT(CUSTOMREQUEST, OBJECTPOINT, 36), - - /* HTTP request, for odd commands like DELETE, TRACE and others */ - CINIT(STDERR, OBJECTPOINT, 37), - - /* 38 is not used */ - - /* send linked-list of post-transfer QUOTE commands */ - CINIT(POSTQUOTE, OBJECTPOINT, 39), - - /* Pass a pointer to string of the output using full variable-replacement - as described elsewhere. */ - CINIT(WRITEINFO, OBJECTPOINT, 40), - - CINIT(VERBOSE, LONG, 41), /* talk a lot */ - CINIT(HEADER, LONG, 42), /* throw the header out too */ - CINIT(NOPROGRESS, LONG, 43), /* shut off the progress meter */ - CINIT(NOBODY, LONG, 44), /* use HEAD to get http document */ - CINIT(FAILONERROR, LONG, 45), /* no output on http error codes >= 300 */ - CINIT(UPLOAD, LONG, 46), /* this is an upload */ - CINIT(POST, LONG, 47), /* HTTP POST method */ - CINIT(DIRLISTONLY, LONG, 48), /* return bare names when listing directories */ - - CINIT(APPEND, LONG, 50), /* Append instead of overwrite on upload! */ - - /* Specify whether to read the user+password from the .netrc or the URL. - * This must be one of the CURL_NETRC_* enums below. */ - CINIT(NETRC, LONG, 51), - - CINIT(FOLLOWLOCATION, LONG, 52), /* use Location: Luke! */ - - CINIT(TRANSFERTEXT, LONG, 53), /* transfer data in text/ASCII format */ - CINIT(PUT, LONG, 54), /* HTTP PUT */ - - /* 55 = OBSOLETE */ - - /* Function that will be called instead of the internal progress display - * function. This function should be defined as the curl_progress_callback - * prototype defines. 
*/ - CINIT(PROGRESSFUNCTION, FUNCTIONPOINT, 56), - - /* Data passed to the progress callback */ - CINIT(PROGRESSDATA, OBJECTPOINT, 57), - - /* We want the referrer field set automatically when following locations */ - CINIT(AUTOREFERER, LONG, 58), - - /* Port of the proxy, can be set in the proxy string as well with: - "[host]:[port]" */ - CINIT(PROXYPORT, LONG, 59), - - /* size of the POST input data, if strlen() is not good to use */ - CINIT(POSTFIELDSIZE, LONG, 60), - - /* tunnel non-http operations through a HTTP proxy */ - CINIT(HTTPPROXYTUNNEL, LONG, 61), - - /* Set the interface string to use as outgoing network interface */ - CINIT(INTERFACE, OBJECTPOINT, 62), - - /* Set the krb4/5 security level, this also enables krb4/5 awareness. This - * is a string, 'clear', 'safe', 'confidential' or 'private'. If the string - * is set but doesn't match one of these, 'private' will be used. */ - CINIT(KRBLEVEL, OBJECTPOINT, 63), - - /* Set if we should verify the peer in ssl handshake, set 1 to verify. */ - CINIT(SSL_VERIFYPEER, LONG, 64), - - /* The CApath or CAfile used to validate the peer certificate - this option is used only if SSL_VERIFYPEER is true */ - CINIT(CAINFO, OBJECTPOINT, 65), - - /* 66 = OBSOLETE */ - /* 67 = OBSOLETE */ - - /* Maximum number of http redirects to follow */ - CINIT(MAXREDIRS, LONG, 68), - - /* Pass a long set to 1 to get the date of the requested document (if - possible)! Pass a zero to shut it off. */ - CINIT(FILETIME, LONG, 69), - - /* This points to a linked list of telnet options */ - CINIT(TELNETOPTIONS, OBJECTPOINT, 70), - - /* Max amount of cached alive connections */ - CINIT(MAXCONNECTS, LONG, 71), - - /* What policy to use when closing connections when the cache is filled - up */ - CINIT(CLOSEPOLICY, LONG, 72), - - /* 73 = OBSOLETE */ - - /* Set to explicitly use a new connection for the upcoming transfer. - Do not use this unless you're absolutely sure of this, as it makes the - operation slower and is less friendly for the network. */ - CINIT(FRESH_CONNECT, LONG, 74), - - /* Set to explicitly forbid the upcoming transfer's connection to be re-used - when done. Do not use this unless you're absolutely sure of this, as it - makes the operation slower and is less friendly for the network. */ - CINIT(FORBID_REUSE, LONG, 75), - - /* Set to a file name that contains random data for libcurl to use to - seed the random engine when doing SSL connects. */ - CINIT(RANDOM_FILE, OBJECTPOINT, 76), - - /* Set to the Entropy Gathering Daemon socket pathname */ - CINIT(EGDSOCKET, OBJECTPOINT, 77), - - /* Time-out connect operations after this amount of seconds, if connects - are OK within this time, then fine... This only aborts the connect - phase. [Only works on unix-style/SIGALRM operating systems] */ - CINIT(CONNECTTIMEOUT, LONG, 78), - - /* Function that will be called to store headers (instead of fwrite). The - * parameters will use fwrite() syntax, make sure to follow them. */ - CINIT(HEADERFUNCTION, FUNCTIONPOINT, 79), - - /* Set this to force the HTTP request to get back to GET. Only really usable - if POST, PUT or a custom request have been used first. - */ - CINIT(HTTPGET, LONG, 80), - - /* Set if we should verify the Common name from the peer certificate in ssl - * handshake, set 1 to check existence, 2 to ensure that it matches the - * provided hostname. */ - CINIT(SSL_VERIFYHOST, LONG, 81), - - /* Specify which file name to write all known cookies in after completed - operation. Set file name to "-" (dash) to make it go to stdout. 
*/ - CINIT(COOKIEJAR, OBJECTPOINT, 82), - - /* Specify which SSL ciphers to use */ - CINIT(SSL_CIPHER_LIST, OBJECTPOINT, 83), - - /* Specify which HTTP version to use! This must be set to one of the - CURL_HTTP_VERSION* enums set below. */ - CINIT(HTTP_VERSION, LONG, 84), - - /* Specifically switch on or off the FTP engine's use of the EPSV command. By - default, that one will always be attempted before the more traditional - PASV command. */ - CINIT(FTP_USE_EPSV, LONG, 85), - - /* type of the file keeping your SSL-certificate ("DER", "PEM", "ENG") */ - CINIT(SSLCERTTYPE, OBJECTPOINT, 86), - - /* name of the file keeping your private SSL-key */ - CINIT(SSLKEY, OBJECTPOINT, 87), - - /* type of the file keeping your private SSL-key ("DER", "PEM", "ENG") */ - CINIT(SSLKEYTYPE, OBJECTPOINT, 88), - - /* crypto engine for the SSL-sub system */ - CINIT(SSLENGINE, OBJECTPOINT, 89), - - /* set the crypto engine for the SSL-sub system as default - the param has no meaning... - */ - CINIT(SSLENGINE_DEFAULT, LONG, 90), - - /* Non-zero value means to use the global dns cache */ - CINIT(DNS_USE_GLOBAL_CACHE, LONG, 91), /* To become OBSOLETE soon */ - - /* DNS cache timeout */ - CINIT(DNS_CACHE_TIMEOUT, LONG, 92), - - /* send linked-list of pre-transfer QUOTE commands */ - CINIT(PREQUOTE, OBJECTPOINT, 93), - - /* set the debug function */ - CINIT(DEBUGFUNCTION, FUNCTIONPOINT, 94), - - /* set the data for the debug function */ - CINIT(DEBUGDATA, OBJECTPOINT, 95), - - /* mark this as start of a cookie session */ - CINIT(COOKIESESSION, LONG, 96), - - /* The CApath directory used to validate the peer certificate - this option is used only if SSL_VERIFYPEER is true */ - CINIT(CAPATH, OBJECTPOINT, 97), - - /* Instruct libcurl to use a smaller receive buffer */ - CINIT(BUFFERSIZE, LONG, 98), - - /* Instruct libcurl to not use any signal/alarm handlers, even when using - timeouts. This option is useful for multi-threaded applications. - See libcurl-the-guide for more background information. */ - CINIT(NOSIGNAL, LONG, 99), - - /* Provide a CURLShare for mutexing non-ts data */ - CINIT(SHARE, OBJECTPOINT, 100), - - /* indicates type of proxy. accepted values are CURLPROXY_HTTP (default), - CURLPROXY_SOCKS4, CURLPROXY_SOCKS4A and CURLPROXY_SOCKS5. */ - CINIT(PROXYTYPE, LONG, 101), - - /* Set the Accept-Encoding string. Use this to tell a server you would like - the response to be compressed. */ - CINIT(ENCODING, OBJECTPOINT, 102), - - /* Set pointer to private data */ - CINIT(PRIVATE, OBJECTPOINT, 103), - - /* Set aliases for HTTP 200 in the HTTP Response header */ - CINIT(HTTP200ALIASES, OBJECTPOINT, 104), - - /* Continue to send authentication (user+password) when following locations, - even when hostname changed. This can potentially send off the name - and password to whatever host the server decides. */ - CINIT(UNRESTRICTED_AUTH, LONG, 105), - - /* Specifically switch on or off the FTP engine's use of the EPRT command ( it - also disables the LPRT attempt). By default, those ones will always be - attempted before the good old traditional PORT command. */ - CINIT(FTP_USE_EPRT, LONG, 106), - - /* Set this to a bitmask value to enable the particular authentications - methods you like. Use this in combination with CURLOPT_USERPWD. - Note that setting multiple bits may cause extra network round-trips. */ - CINIT(HTTPAUTH, LONG, 107), - - /* Set the ssl context callback function, currently only for OpenSSL ssl_ctx - in second argument. The function must be matching the - curl_ssl_ctx_callback proto. 
*/ - CINIT(SSL_CTX_FUNCTION, FUNCTIONPOINT, 108), - - /* Set the userdata for the ssl context callback function's third - argument */ - CINIT(SSL_CTX_DATA, OBJECTPOINT, 109), - - /* FTP Option that causes missing dirs to be created on the remote server. - In 7.19.4 we introduced the convenience enums for this option using the - CURLFTP_CREATE_DIR prefix. - */ - CINIT(FTP_CREATE_MISSING_DIRS, LONG, 110), - - /* Set this to a bitmask value to enable the particular authentications - methods you like. Use this in combination with CURLOPT_PROXYUSERPWD. - Note that setting multiple bits may cause extra network round-trips. */ - CINIT(PROXYAUTH, LONG, 111), - - /* FTP option that changes the timeout, in seconds, associated with - getting a response. This is different from transfer timeout time and - essentially places a demand on the FTP server to acknowledge commands - in a timely manner. */ - CINIT(FTP_RESPONSE_TIMEOUT, LONG, 112), -#define CURLOPT_SERVER_RESPONSE_TIMEOUT CURLOPT_FTP_RESPONSE_TIMEOUT - - /* Set this option to one of the CURL_IPRESOLVE_* defines (see below) to - tell libcurl to resolve names to those IP versions only. This only has - affect on systems with support for more than one, i.e IPv4 _and_ IPv6. */ - CINIT(IPRESOLVE, LONG, 113), - - /* Set this option to limit the size of a file that will be downloaded from - an HTTP or FTP server. - - Note there is also _LARGE version which adds large file support for - platforms which have larger off_t sizes. See MAXFILESIZE_LARGE below. */ - CINIT(MAXFILESIZE, LONG, 114), - - /* See the comment for INFILESIZE above, but in short, specifies - * the size of the file being uploaded. -1 means unknown. - */ - CINIT(INFILESIZE_LARGE, OFF_T, 115), - - /* Sets the continuation offset. There is also a LONG version of this; - * look above for RESUME_FROM. - */ - CINIT(RESUME_FROM_LARGE, OFF_T, 116), - - /* Sets the maximum size of data that will be downloaded from - * an HTTP or FTP server. See MAXFILESIZE above for the LONG version. - */ - CINIT(MAXFILESIZE_LARGE, OFF_T, 117), - - /* Set this option to the file name of your .netrc file you want libcurl - to parse (using the CURLOPT_NETRC option). If not set, libcurl will do - a poor attempt to find the user's home directory and check for a .netrc - file in there. */ - CINIT(NETRC_FILE, OBJECTPOINT, 118), - - /* Enable SSL/TLS for FTP, pick one of: - CURLFTPSSL_TRY - try using SSL, proceed anyway otherwise - CURLFTPSSL_CONTROL - SSL for the control connection or fail - CURLFTPSSL_ALL - SSL for all communication or fail - */ - CINIT(USE_SSL, LONG, 119), - - /* The _LARGE version of the standard POSTFIELDSIZE option */ - CINIT(POSTFIELDSIZE_LARGE, OFF_T, 120), - - /* Enable/disable the TCP Nagle algorithm */ - CINIT(TCP_NODELAY, LONG, 121), - - /* 122 OBSOLETE, used in 7.12.3. Gone in 7.13.0 */ - /* 123 OBSOLETE. Gone in 7.16.0 */ - /* 124 OBSOLETE, used in 7.12.3. Gone in 7.13.0 */ - /* 125 OBSOLETE, used in 7.12.3. Gone in 7.13.0 */ - /* 126 OBSOLETE, used in 7.12.3. Gone in 7.13.0 */ - /* 127 OBSOLETE. Gone in 7.16.0 */ - /* 128 OBSOLETE. Gone in 7.16.0 */ - - /* When FTP over SSL/TLS is selected (with CURLOPT_USE_SSL), this option - can be used to change libcurl's default action which is to first try - "AUTH SSL" and then "AUTH TLS" in this order, and proceed when a OK - response has been received. 
- - Available parameters are: - CURLFTPAUTH_DEFAULT - let libcurl decide - CURLFTPAUTH_SSL - try "AUTH SSL" first, then TLS - CURLFTPAUTH_TLS - try "AUTH TLS" first, then SSL - */ - CINIT(FTPSSLAUTH, LONG, 129), - - CINIT(IOCTLFUNCTION, FUNCTIONPOINT, 130), - CINIT(IOCTLDATA, OBJECTPOINT, 131), - - /* 132 OBSOLETE. Gone in 7.16.0 */ - /* 133 OBSOLETE. Gone in 7.16.0 */ - - /* zero terminated string for pass on to the FTP server when asked for - "account" info */ - CINIT(FTP_ACCOUNT, OBJECTPOINT, 134), - - /* feed cookies into cookie engine */ - CINIT(COOKIELIST, OBJECTPOINT, 135), - - /* ignore Content-Length */ - CINIT(IGNORE_CONTENT_LENGTH, LONG, 136), - - /* Set to non-zero to skip the IP address received in a 227 PASV FTP server - response. Typically used for FTP-SSL purposes but is not restricted to - that. libcurl will then instead use the same IP address it used for the - control connection. */ - CINIT(FTP_SKIP_PASV_IP, LONG, 137), - - /* Select "file method" to use when doing FTP, see the curl_ftpmethod - above. */ - CINIT(FTP_FILEMETHOD, LONG, 138), - - /* Local port number to bind the socket to */ - CINIT(LOCALPORT, LONG, 139), - - /* Number of ports to try, including the first one set with LOCALPORT. - Thus, setting it to 1 will make no additional attempts but the first. - */ - CINIT(LOCALPORTRANGE, LONG, 140), - - /* no transfer, set up connection and let application use the socket by - extracting it with CURLINFO_LASTSOCKET */ - CINIT(CONNECT_ONLY, LONG, 141), - - /* Function that will be called to convert from the - network encoding (instead of using the iconv calls in libcurl) */ - CINIT(CONV_FROM_NETWORK_FUNCTION, FUNCTIONPOINT, 142), - - /* Function that will be called to convert to the - network encoding (instead of using the iconv calls in libcurl) */ - CINIT(CONV_TO_NETWORK_FUNCTION, FUNCTIONPOINT, 143), - - /* Function that will be called to convert from UTF8 - (instead of using the iconv calls in libcurl) - Note that this is used only for SSL certificate processing */ - CINIT(CONV_FROM_UTF8_FUNCTION, FUNCTIONPOINT, 144), - - /* if the connection proceeds too quickly then need to slow it down */ - /* limit-rate: maximum number of bytes per second to send or receive */ - CINIT(MAX_SEND_SPEED_LARGE, OFF_T, 145), - CINIT(MAX_RECV_SPEED_LARGE, OFF_T, 146), - - /* Pointer to command string to send if USER/PASS fails. 
*/ - CINIT(FTP_ALTERNATIVE_TO_USER, OBJECTPOINT, 147), - - /* callback function for setting socket options */ - CINIT(SOCKOPTFUNCTION, FUNCTIONPOINT, 148), - CINIT(SOCKOPTDATA, OBJECTPOINT, 149), - - /* set to 0 to disable session ID re-use for this transfer, default is - enabled (== 1) */ - CINIT(SSL_SESSIONID_CACHE, LONG, 150), - - /* allowed SSH authentication methods */ - CINIT(SSH_AUTH_TYPES, LONG, 151), - - /* Used by scp/sftp to do public/private key authentication */ - CINIT(SSH_PUBLIC_KEYFILE, OBJECTPOINT, 152), - CINIT(SSH_PRIVATE_KEYFILE, OBJECTPOINT, 153), - - /* Send CCC (Clear Command Channel) after authentication */ - CINIT(FTP_SSL_CCC, LONG, 154), - - /* Same as TIMEOUT and CONNECTTIMEOUT, but with ms resolution */ - CINIT(TIMEOUT_MS, LONG, 155), - CINIT(CONNECTTIMEOUT_MS, LONG, 156), - - /* set to zero to disable the libcurl's decoding and thus pass the raw body - data to the application even when it is encoded/compressed */ - CINIT(HTTP_TRANSFER_DECODING, LONG, 157), - CINIT(HTTP_CONTENT_DECODING, LONG, 158), - - /* Permission used when creating new files and directories on the remote - server for protocols that support it, SFTP/SCP/FILE */ - CINIT(NEW_FILE_PERMS, LONG, 159), - CINIT(NEW_DIRECTORY_PERMS, LONG, 160), - - /* Set the behaviour of POST when redirecting. Values must be set to one - of CURL_REDIR* defines below. This used to be called CURLOPT_POST301 */ - CINIT(POSTREDIR, LONG, 161), - - /* used by scp/sftp to verify the host's public key */ - CINIT(SSH_HOST_PUBLIC_KEY_MD5, OBJECTPOINT, 162), - - /* Callback function for opening socket (instead of socket(2)). Optionally, - callback is able change the address or refuse to connect returning - CURL_SOCKET_BAD. The callback should have type - curl_opensocket_callback */ - CINIT(OPENSOCKETFUNCTION, FUNCTIONPOINT, 163), - CINIT(OPENSOCKETDATA, OBJECTPOINT, 164), - - /* POST volatile input fields. */ - CINIT(COPYPOSTFIELDS, OBJECTPOINT, 165), - - /* set transfer mode (;type=) when doing FTP via an HTTP proxy */ - CINIT(PROXY_TRANSFER_MODE, LONG, 166), - - /* Callback function for seeking in the input stream */ - CINIT(SEEKFUNCTION, FUNCTIONPOINT, 167), - CINIT(SEEKDATA, OBJECTPOINT, 168), - - /* CRL file */ - CINIT(CRLFILE, OBJECTPOINT, 169), - - /* Issuer certificate */ - CINIT(ISSUERCERT, OBJECTPOINT, 170), - - /* (IPv6) Address scope */ - CINIT(ADDRESS_SCOPE, LONG, 171), - - /* Collect certificate chain info and allow it to get retrievable with - CURLINFO_CERTINFO after the transfer is complete. (Unfortunately) only - working with OpenSSL-powered builds. */ - CINIT(CERTINFO, LONG, 172), - - /* "name" and "pwd" to use when fetching. */ - CINIT(USERNAME, OBJECTPOINT, 173), - CINIT(PASSWORD, OBJECTPOINT, 174), - - /* "name" and "pwd" to use with Proxy when fetching. */ - CINIT(PROXYUSERNAME, OBJECTPOINT, 175), - CINIT(PROXYPASSWORD, OBJECTPOINT, 176), - - /* Comma separated list of hostnames defining no-proxy zones. These should - match both hostnames directly, and hostnames within a domain. For - example, local.com will match local.com and www.local.com, but NOT - notlocal.com or www.notlocal.com. For compatibility with other - implementations of this, .local.com will be considered to be the same as - local.com. A single * is the only valid wildcard, and effectively - disables the use of proxy. 
*/ - CINIT(NOPROXY, OBJECTPOINT, 177), - - /* block size for TFTP transfers */ - CINIT(TFTP_BLKSIZE, LONG, 178), - - /* Socks Service */ - CINIT(SOCKS5_GSSAPI_SERVICE, OBJECTPOINT, 179), - - /* Socks Service */ - CINIT(SOCKS5_GSSAPI_NEC, LONG, 180), - - /* set the bitmask for the protocols that are allowed to be used for the - transfer, which thus helps the app which takes URLs from users or other - external inputs and want to restrict what protocol(s) to deal - with. Defaults to CURLPROTO_ALL. */ - CINIT(PROTOCOLS, LONG, 181), - - /* set the bitmask for the protocols that libcurl is allowed to follow to, - as a subset of the CURLOPT_PROTOCOLS ones. That means the protocol needs - to be set in both bitmasks to be allowed to get redirected to. Defaults - to all protocols except FILE and SCP. */ - CINIT(REDIR_PROTOCOLS, LONG, 182), - - /* set the SSH knownhost file name to use */ - CINIT(SSH_KNOWNHOSTS, OBJECTPOINT, 183), - - /* set the SSH host key callback, must point to a curl_sshkeycallback - function */ - CINIT(SSH_KEYFUNCTION, FUNCTIONPOINT, 184), - - /* set the SSH host key callback custom pointer */ - CINIT(SSH_KEYDATA, OBJECTPOINT, 185), - - /* set the SMTP mail originator */ - CINIT(MAIL_FROM, OBJECTPOINT, 186), - - /* set the SMTP mail receiver(s) */ - CINIT(MAIL_RCPT, OBJECTPOINT, 187), - - /* FTP: send PRET before PASV */ - CINIT(FTP_USE_PRET, LONG, 188), - - /* RTSP request method (OPTIONS, SETUP, PLAY, etc...) */ - CINIT(RTSP_REQUEST, LONG, 189), - - /* The RTSP session identifier */ - CINIT(RTSP_SESSION_ID, OBJECTPOINT, 190), - - /* The RTSP stream URI */ - CINIT(RTSP_STREAM_URI, OBJECTPOINT, 191), - - /* The Transport: header to use in RTSP requests */ - CINIT(RTSP_TRANSPORT, OBJECTPOINT, 192), - - /* Manually initialize the client RTSP CSeq for this handle */ - CINIT(RTSP_CLIENT_CSEQ, LONG, 193), - - /* Manually initialize the server RTSP CSeq for this handle */ - CINIT(RTSP_SERVER_CSEQ, LONG, 194), - - /* The stream to pass to INTERLEAVEFUNCTION. */ - CINIT(INTERLEAVEDATA, OBJECTPOINT, 195), - - /* Let the application define a custom write method for RTP data */ - CINIT(INTERLEAVEFUNCTION, FUNCTIONPOINT, 196), - - /* Turn on wildcard matching */ - CINIT(WILDCARDMATCH, LONG, 197), - - /* Directory matching callback called before downloading of an - individual file (chunk) started */ - CINIT(CHUNK_BGN_FUNCTION, FUNCTIONPOINT, 198), - - /* Directory matching callback called after the file (chunk) - was downloaded, or skipped */ - CINIT(CHUNK_END_FUNCTION, FUNCTIONPOINT, 199), - - /* Change match (fnmatch-like) callback for wildcard matching */ - CINIT(FNMATCH_FUNCTION, FUNCTIONPOINT, 200), - - /* Let the application define custom chunk data pointer */ - CINIT(CHUNK_DATA, OBJECTPOINT, 201), - - /* FNMATCH_FUNCTION user pointer */ - CINIT(FNMATCH_DATA, OBJECTPOINT, 202), - - CURLOPT_LASTENTRY /* the last unused */ -} CURLoption; - -#ifndef CURL_NO_OLDIES /* define this to test if your app builds with all - the obsolete stuff removed! 
*/ - -/* Backwards compatibility with older names */ -/* These are scheduled to disappear by 2011 */ - -/* This was added in version 7.19.1 */ -#define CURLOPT_POST301 CURLOPT_POSTREDIR - -/* These are scheduled to disappear by 2009 */ - -/* The following were added in 7.17.0 */ -#define CURLOPT_SSLKEYPASSWD CURLOPT_KEYPASSWD -#define CURLOPT_FTPAPPEND CURLOPT_APPEND -#define CURLOPT_FTPLISTONLY CURLOPT_DIRLISTONLY -#define CURLOPT_FTP_SSL CURLOPT_USE_SSL - -/* The following were added earlier */ - -#define CURLOPT_SSLCERTPASSWD CURLOPT_KEYPASSWD -#define CURLOPT_KRB4LEVEL CURLOPT_KRBLEVEL - -#else -/* This is set if CURL_NO_OLDIES is defined at compile-time */ -#undef CURLOPT_DNS_USE_GLOBAL_CACHE /* soon obsolete */ -#endif - - - /* Below here follows defines for the CURLOPT_IPRESOLVE option. If a host - name resolves addresses using more than one IP protocol version, this - option might be handy to force libcurl to use a specific IP version. */ -#define CURL_IPRESOLVE_WHATEVER 0 /* default, resolves addresses to all IP - versions that your system allows */ -#define CURL_IPRESOLVE_V4 1 /* resolve to ipv4 addresses */ -#define CURL_IPRESOLVE_V6 2 /* resolve to ipv6 addresses */ - - /* three convenient "aliases" that follow the name scheme better */ -#define CURLOPT_WRITEDATA CURLOPT_FILE -#define CURLOPT_READDATA CURLOPT_INFILE -#define CURLOPT_HEADERDATA CURLOPT_WRITEHEADER -#define CURLOPT_RTSPHEADER CURLOPT_HTTPHEADER - - /* These enums are for use with the CURLOPT_HTTP_VERSION option. */ -enum { - CURL_HTTP_VERSION_NONE, /* setting this means we don't care, and that we'd - like the library to choose the best possible - for us! */ - CURL_HTTP_VERSION_1_0, /* please use HTTP 1.0 in the request */ - CURL_HTTP_VERSION_1_1, /* please use HTTP 1.1 in the request */ - - CURL_HTTP_VERSION_LAST /* *ILLEGAL* http version */ -}; - -/* - * Public API enums for RTSP requests - */ -enum { - CURL_RTSPREQ_NONE, /* first in list */ - CURL_RTSPREQ_OPTIONS, - CURL_RTSPREQ_DESCRIBE, - CURL_RTSPREQ_ANNOUNCE, - CURL_RTSPREQ_SETUP, - CURL_RTSPREQ_PLAY, - CURL_RTSPREQ_PAUSE, - CURL_RTSPREQ_TEARDOWN, - CURL_RTSPREQ_GET_PARAMETER, - CURL_RTSPREQ_SET_PARAMETER, - CURL_RTSPREQ_RECORD, - CURL_RTSPREQ_RECEIVE, - CURL_RTSPREQ_LAST /* last in list */ -}; - - /* These enums are for use with the CURLOPT_NETRC option. */ -enum CURL_NETRC_OPTION { - CURL_NETRC_IGNORED, /* The .netrc will never be read. - * This is the default. */ - CURL_NETRC_OPTIONAL, /* A user:password in the URL will be preferred - * to one in the .netrc. */ - CURL_NETRC_REQUIRED, /* A user:password in the URL will be ignored. - * Unless one is set programmatically, the .netrc - * will be queried. */ - CURL_NETRC_LAST -}; - -enum { - CURL_SSLVERSION_DEFAULT, - CURL_SSLVERSION_TLSv1, - CURL_SSLVERSION_SSLv2, - CURL_SSLVERSION_SSLv3, - - CURL_SSLVERSION_LAST /* never use, keep last */ -}; - -/* symbols to use with CURLOPT_POSTREDIR. 
- CURL_REDIR_POST_301 and CURL_REDIR_POST_302 can be bitwise ORed so that - CURL_REDIR_POST_301 | CURL_REDIR_POST_302 == CURL_REDIR_POST_ALL */ - -#define CURL_REDIR_GET_ALL 0 -#define CURL_REDIR_POST_301 1 -#define CURL_REDIR_POST_302 2 -#define CURL_REDIR_POST_ALL (CURL_REDIR_POST_301|CURL_REDIR_POST_302) - -typedef enum { - CURL_TIMECOND_NONE, - - CURL_TIMECOND_IFMODSINCE, - CURL_TIMECOND_IFUNMODSINCE, - CURL_TIMECOND_LASTMOD, - - CURL_TIMECOND_LAST -} curl_TimeCond; - - -/* curl_strequal() and curl_strnequal() are subject for removal in a future - libcurl, see lib/README.curlx for details */ -CURL_EXTERN int (curl_strequal)(const char *s1, const char *s2); -CURL_EXTERN int (curl_strnequal)(const char *s1, const char *s2, size_t n); - -/* name is uppercase CURLFORM_ */ -#ifdef CFINIT -#undef CFINIT -#endif - -#ifdef CURL_ISOCPP -#define CFINIT(name) CURLFORM_ ## name -#else -/* The macro "##" is ISO C, we assume pre-ISO C doesn't support it. */ -#define CFINIT(name) CURLFORM_/**/name -#endif - -typedef enum { - CFINIT(NOTHING), /********* the first one is unused ************/ - - /* */ - CFINIT(COPYNAME), - CFINIT(PTRNAME), - CFINIT(NAMELENGTH), - CFINIT(COPYCONTENTS), - CFINIT(PTRCONTENTS), - CFINIT(CONTENTSLENGTH), - CFINIT(FILECONTENT), - CFINIT(ARRAY), - CFINIT(OBSOLETE), - CFINIT(FILE), - - CFINIT(BUFFER), - CFINIT(BUFFERPTR), - CFINIT(BUFFERLENGTH), - - CFINIT(CONTENTTYPE), - CFINIT(CONTENTHEADER), - CFINIT(FILENAME), - CFINIT(END), - CFINIT(OBSOLETE2), - - CFINIT(STREAM), - - CURLFORM_LASTENTRY /* the last unused */ -} CURLformoption; - -#undef CFINIT /* done */ - -/* structure to be used as parameter for CURLFORM_ARRAY */ -struct curl_forms { - CURLformoption option; - const char *value; -}; - -/* use this for multipart formpost building */ -/* Returns code for curl_formadd() - * - * Returns: - * CURL_FORMADD_OK on success - * CURL_FORMADD_MEMORY if the FormInfo allocation fails - * CURL_FORMADD_OPTION_TWICE if one option is given twice for one Form - * CURL_FORMADD_NULL if a null pointer was given for a char - * CURL_FORMADD_MEMORY if the allocation of a FormInfo struct failed - * CURL_FORMADD_UNKNOWN_OPTION if an unknown option was used - * CURL_FORMADD_INCOMPLETE if the some FormInfo is not complete (or error) - * CURL_FORMADD_MEMORY if a curl_httppost struct cannot be allocated - * CURL_FORMADD_MEMORY if some allocation for string copying failed. - * CURL_FORMADD_ILLEGAL_ARRAY if an illegal option is used in an array - * - ***************************************************************************/ -typedef enum { - CURL_FORMADD_OK, /* first, no error */ - - CURL_FORMADD_MEMORY, - CURL_FORMADD_OPTION_TWICE, - CURL_FORMADD_NULL, - CURL_FORMADD_UNKNOWN_OPTION, - CURL_FORMADD_INCOMPLETE, - CURL_FORMADD_ILLEGAL_ARRAY, - CURL_FORMADD_DISABLED, /* libcurl was built with this disabled */ - - CURL_FORMADD_LAST /* last */ -} CURLFORMcode; - -/* - * NAME curl_formadd() - * - * DESCRIPTION - * - * Pretty advanced function for building multi-part formposts. Each invoke - * adds one part that together construct a full post. Then use - * CURLOPT_HTTPPOST to send it off to libcurl. - */ -CURL_EXTERN CURLFORMcode curl_formadd(struct curl_httppost **httppost, - struct curl_httppost **last_post, - ...); - -/* - * callback function for curl_formget() - * The void *arg pointer will be the one passed as second argument to - * curl_formget(). - * The character buffer passed to it must not be freed. - * Should return the buffer length passed to it as the argument "len" on - * success. 
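curl_formadd() above is variadic and easiest to grasp from an example. A minimal sketch with error handling trimmed (each call actually returns one of the CURLFORMcode values listed above):

#include <curl/curl.h>

/* Build a two-part formpost: one literal field, one file upload. */
static struct curl_httppost *build_form(CURL *handle)
{
    struct curl_httppost *post = NULL;
    struct curl_httppost *last = NULL;

    curl_formadd(&post, &last,
                 CURLFORM_COPYNAME, "name",
                 CURLFORM_COPYCONTENTS, "value",
                 CURLFORM_END);
    curl_formadd(&post, &last,
                 CURLFORM_COPYNAME, "upload",
                 CURLFORM_FILE, "body.txt",
                 CURLFORM_END);

    curl_easy_setopt(handle, CURLOPT_HTTPPOST, post);
    return post; /* release with curl_formfree() after the transfer */
}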
- */ -typedef size_t (*curl_formget_callback)(void *arg, const char *buf, size_t len); - -/* - * NAME curl_formget() - * - * DESCRIPTION - * - * Serialize a curl_httppost struct built with curl_formadd(). - * Accepts a void pointer as second argument which will be passed to - * the curl_formget_callback function. - * Returns 0 on success. - */ -CURL_EXTERN int curl_formget(struct curl_httppost *form, void *arg, - curl_formget_callback append); -/* - * NAME curl_formfree() - * - * DESCRIPTION - * - * Free a multipart formpost previously built with curl_formadd(). - */ -CURL_EXTERN void curl_formfree(struct curl_httppost *form); - -/* - * NAME curl_getenv() - * - * DESCRIPTION - * - * Returns a malloc()'ed string that MUST be curl_free()ed after usage is - * complete. DEPRECATED - see lib/README.curlx - */ -CURL_EXTERN char *curl_getenv(const char *variable); - -/* - * NAME curl_version() - * - * DESCRIPTION - * - * Returns a static ascii string of the libcurl version. - */ -CURL_EXTERN char *curl_version(void); - -/* - * NAME curl_easy_escape() - * - * DESCRIPTION - * - * Escapes URL strings (converts all letters consider illegal in URLs to their - * %XX versions). This function returns a new allocated string or NULL if an - * error occurred. - */ -CURL_EXTERN char *curl_easy_escape(CURL *handle, - const char *string, - int length); - -/* the previous version: */ -CURL_EXTERN char *curl_escape(const char *string, - int length); - - -/* - * NAME curl_easy_unescape() - * - * DESCRIPTION - * - * Unescapes URL encoding in strings (converts all %XX codes to their 8bit - * versions). This function returns a new allocated string or NULL if an error - * occurred. - * Conversion Note: On non-ASCII platforms the ASCII %XX codes are - * converted into the host encoding. - */ -CURL_EXTERN char *curl_easy_unescape(CURL *handle, - const char *string, - int length, - int *outlength); - -/* the previous version */ -CURL_EXTERN char *curl_unescape(const char *string, - int length); - -/* - * NAME curl_free() - * - * DESCRIPTION - * - * Provided for de-allocation in the same translation unit that did the - * allocation. Added in libcurl 7.10 - */ -CURL_EXTERN void curl_free(void *p); - -/* - * NAME curl_global_init() - * - * DESCRIPTION - * - * curl_global_init() should be invoked exactly once for each application that - * uses libcurl and before any call of other libcurl functions. - * - * This function is not thread-safe! - */ -CURL_EXTERN CURLcode curl_global_init(long flags); - -/* - * NAME curl_global_init_mem() - * - * DESCRIPTION - * - * curl_global_init() or curl_global_init_mem() should be invoked exactly once - * for each application that uses libcurl. This function can be used to - * initialize libcurl and set user defined memory management callback - * functions. Users can implement memory management routines to check for - * memory leaks, check for mis-use of the curl library etc. User registered - * callback routines with be invoked by this library instead of the system - * memory management routines like malloc, free etc. 
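Since curl_easy_escape() allocates with libcurl's own allocator, pairing it with curl_free() matters; a short sketch (not part of the patch):

#include <stdio.h>
#include <curl/curl.h>

/* URL-encode a string; a length of 0 means "use strlen()". */
static void escape_demo(CURL *handle)
{
    char *enc = curl_easy_escape(handle, "a value with spaces", 0);
    if(enc) {
        printf("encoded: %s\n", enc);
        curl_free(enc); /* never plain free(): the allocators may differ */
    }
}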
- */ -CURL_EXTERN CURLcode curl_global_init_mem(long flags, - curl_malloc_callback m, - curl_free_callback f, - curl_realloc_callback r, - curl_strdup_callback s, - curl_calloc_callback c); - -/* - * NAME curl_global_cleanup() - * - * DESCRIPTION - * - * curl_global_cleanup() should be invoked exactly once for each application - * that uses libcurl - */ -CURL_EXTERN void curl_global_cleanup(void); - -/* linked-list structure for the CURLOPT_QUOTE option (and other) */ -struct curl_slist { - char *data; - struct curl_slist *next; -}; - -/* - * NAME curl_slist_append() - * - * DESCRIPTION - * - * Appends a string to a linked list. If no list exists, it will be created - * first. Returns the new list, after appending. - */ -CURL_EXTERN struct curl_slist *curl_slist_append(struct curl_slist *, - const char *); - -/* - * NAME curl_slist_free_all() - * - * DESCRIPTION - * - * free a previously built curl_slist. - */ -CURL_EXTERN void curl_slist_free_all(struct curl_slist *); - -/* - * NAME curl_getdate() - * - * DESCRIPTION - * - * Returns the time, in seconds since 1 Jan 1970 of the time string given in - * the first argument. The time argument in the second parameter is unused - * and should be set to NULL. - */ -CURL_EXTERN time_t curl_getdate(const char *p, const time_t *unused); - -/* info about the certificate chain, only for OpenSSL builds. Asked - for with CURLOPT_CERTINFO / CURLINFO_CERTINFO */ -struct curl_certinfo { - int num_of_certs; /* number of certificates with information */ - struct curl_slist **certinfo; /* for each index in this array, there's a - linked list with textual information in the - format "name: value" */ -}; - -#define CURLINFO_STRING 0x100000 -#define CURLINFO_LONG 0x200000 -#define CURLINFO_DOUBLE 0x300000 -#define CURLINFO_SLIST 0x400000 -#define CURLINFO_MASK 0x0fffff -#define CURLINFO_TYPEMASK 0xf00000 - -typedef enum { - CURLINFO_NONE, /* first, never use this */ - CURLINFO_EFFECTIVE_URL = CURLINFO_STRING + 1, - CURLINFO_RESPONSE_CODE = CURLINFO_LONG + 2, - CURLINFO_TOTAL_TIME = CURLINFO_DOUBLE + 3, - CURLINFO_NAMELOOKUP_TIME = CURLINFO_DOUBLE + 4, - CURLINFO_CONNECT_TIME = CURLINFO_DOUBLE + 5, - CURLINFO_PRETRANSFER_TIME = CURLINFO_DOUBLE + 6, - CURLINFO_SIZE_UPLOAD = CURLINFO_DOUBLE + 7, - CURLINFO_SIZE_DOWNLOAD = CURLINFO_DOUBLE + 8, - CURLINFO_SPEED_DOWNLOAD = CURLINFO_DOUBLE + 9, - CURLINFO_SPEED_UPLOAD = CURLINFO_DOUBLE + 10, - CURLINFO_HEADER_SIZE = CURLINFO_LONG + 11, - CURLINFO_REQUEST_SIZE = CURLINFO_LONG + 12, - CURLINFO_SSL_VERIFYRESULT = CURLINFO_LONG + 13, - CURLINFO_FILETIME = CURLINFO_LONG + 14, - CURLINFO_CONTENT_LENGTH_DOWNLOAD = CURLINFO_DOUBLE + 15, - CURLINFO_CONTENT_LENGTH_UPLOAD = CURLINFO_DOUBLE + 16, - CURLINFO_STARTTRANSFER_TIME = CURLINFO_DOUBLE + 17, - CURLINFO_CONTENT_TYPE = CURLINFO_STRING + 18, - CURLINFO_REDIRECT_TIME = CURLINFO_DOUBLE + 19, - CURLINFO_REDIRECT_COUNT = CURLINFO_LONG + 20, - CURLINFO_PRIVATE = CURLINFO_STRING + 21, - CURLINFO_HTTP_CONNECTCODE = CURLINFO_LONG + 22, - CURLINFO_HTTPAUTH_AVAIL = CURLINFO_LONG + 23, - CURLINFO_PROXYAUTH_AVAIL = CURLINFO_LONG + 24, - CURLINFO_OS_ERRNO = CURLINFO_LONG + 25, - CURLINFO_NUM_CONNECTS = CURLINFO_LONG + 26, - CURLINFO_SSL_ENGINES = CURLINFO_SLIST + 27, - CURLINFO_COOKIELIST = CURLINFO_SLIST + 28, - CURLINFO_LASTSOCKET = CURLINFO_LONG + 29, - CURLINFO_FTP_ENTRY_PATH = CURLINFO_STRING + 30, - CURLINFO_REDIRECT_URL = CURLINFO_STRING + 31, - CURLINFO_PRIMARY_IP = CURLINFO_STRING + 32, - CURLINFO_APPCONNECT_TIME = CURLINFO_DOUBLE + 33, - CURLINFO_CERTINFO = CURLINFO_SLIST + 34, 
- CURLINFO_CONDITION_UNMET = CURLINFO_LONG + 35, - CURLINFO_RTSP_SESSION_ID = CURLINFO_STRING + 36, - CURLINFO_RTSP_CLIENT_CSEQ = CURLINFO_LONG + 37, - CURLINFO_RTSP_SERVER_CSEQ = CURLINFO_LONG + 38, - CURLINFO_RTSP_CSEQ_RECV = CURLINFO_LONG + 39, - CURLINFO_PRIMARY_PORT = CURLINFO_LONG + 40, - CURLINFO_LOCAL_IP = CURLINFO_STRING + 41, - CURLINFO_LOCAL_PORT = CURLINFO_LONG + 42, - /* Fill in new entries below here! */ - - CURLINFO_LASTONE = 42 -} CURLINFO; - -/* CURLINFO_RESPONSE_CODE is the new name for the option previously known as - CURLINFO_HTTP_CODE */ -#define CURLINFO_HTTP_CODE CURLINFO_RESPONSE_CODE - -typedef enum { - CURLCLOSEPOLICY_NONE, /* first, never use this */ - - CURLCLOSEPOLICY_OLDEST, - CURLCLOSEPOLICY_LEAST_RECENTLY_USED, - CURLCLOSEPOLICY_LEAST_TRAFFIC, - CURLCLOSEPOLICY_SLOWEST, - CURLCLOSEPOLICY_CALLBACK, - - CURLCLOSEPOLICY_LAST /* last, never use this */ -} curl_closepolicy; - -#define CURL_GLOBAL_SSL (1<<0) -#define CURL_GLOBAL_WIN32 (1<<1) -#define CURL_GLOBAL_ALL (CURL_GLOBAL_SSL|CURL_GLOBAL_WIN32) -#define CURL_GLOBAL_NOTHING 0 -#define CURL_GLOBAL_DEFAULT CURL_GLOBAL_ALL - - -/***************************************************************************** - * Setup defines, protos etc for the sharing stuff. - */ - -/* Different data locks for a single share */ -typedef enum { - CURL_LOCK_DATA_NONE = 0, - /* CURL_LOCK_DATA_SHARE is used internally to say that - * the locking is just made to change the internal state of the share - * itself. - */ - CURL_LOCK_DATA_SHARE, - CURL_LOCK_DATA_COOKIE, - CURL_LOCK_DATA_DNS, - CURL_LOCK_DATA_SSL_SESSION, - CURL_LOCK_DATA_CONNECT, - CURL_LOCK_DATA_LAST -} curl_lock_data; - -/* Different lock access types */ -typedef enum { - CURL_LOCK_ACCESS_NONE = 0, /* unspecified action */ - CURL_LOCK_ACCESS_SHARED = 1, /* for read perhaps */ - CURL_LOCK_ACCESS_SINGLE = 2, /* for write perhaps */ - CURL_LOCK_ACCESS_LAST /* never use */ -} curl_lock_access; - -typedef void (*curl_lock_function)(CURL *handle, - curl_lock_data data, - curl_lock_access locktype, - void *userptr); -typedef void (*curl_unlock_function)(CURL *handle, - curl_lock_data data, - void *userptr); - -typedef void CURLSH; - -typedef enum { - CURLSHE_OK, /* all is fine */ - CURLSHE_BAD_OPTION, /* 1 */ - CURLSHE_IN_USE, /* 2 */ - CURLSHE_INVALID, /* 3 */ - CURLSHE_NOMEM, /* out of memory */ - CURLSHE_LAST /* never use */ -} CURLSHcode; - -typedef enum { - CURLSHOPT_NONE, /* don't use */ - CURLSHOPT_SHARE, /* specify a data type to share */ - CURLSHOPT_UNSHARE, /* specify which data type to stop sharing */ - CURLSHOPT_LOCKFUNC, /* pass in a 'curl_lock_function' pointer */ - CURLSHOPT_UNLOCKFUNC, /* pass in a 'curl_unlock_function' pointer */ - CURLSHOPT_USERDATA, /* pass in a user data pointer used in the lock/unlock - callback functions */ - CURLSHOPT_LAST /* never use */ -} CURLSHoption; - -CURL_EXTERN CURLSH *curl_share_init(void); -CURL_EXTERN CURLSHcode curl_share_setopt(CURLSH *, CURLSHoption option, ...); -CURL_EXTERN CURLSHcode curl_share_cleanup(CURLSH *); - -/**************************************************************************** - * Structures for querying information about the curl library at runtime. - */ - -typedef enum { - CURLVERSION_FIRST, - CURLVERSION_SECOND, - CURLVERSION_THIRD, - CURLVERSION_FOURTH, - CURLVERSION_LAST /* never actually use this */ -} CURLversion; - -/* The 'CURLVERSION_NOW' is the symbolic name meant to be used by - basically all programs ever that want to get version information. 
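The share interface above in a single-threaded sketch. CURLOPT_SHARE is assumed from the full CURLoption list (it is not in the hunks shown here), and the lock callbacks are omitted because nothing runs concurrently:

#include <curl/curl.h>

/* Let two easy handles reuse one DNS cache. */
static CURLSH *make_dns_share(CURL *h1, CURL *h2)
{
    CURLSH *share = curl_share_init();
    curl_share_setopt(share, CURLSHOPT_SHARE, CURL_LOCK_DATA_DNS);
    curl_easy_setopt(h1, CURLOPT_SHARE, share);
    curl_easy_setopt(h2, CURLOPT_SHARE, share);
    return share; /* curl_share_cleanup() once both handles are finished */
}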
It is - meant to be a built-in version number for what kind of struct the caller - expects. If the struct ever changes, we redefine the NOW to another enum - from above. */ -#define CURLVERSION_NOW CURLVERSION_FOURTH - -typedef struct { - CURLversion age; /* age of the returned struct */ - const char *version; /* LIBCURL_VERSION */ - unsigned int version_num; /* LIBCURL_VERSION_NUM */ - const char *host; /* OS/host/cpu/machine when configured */ - int features; /* bitmask, see defines below */ - const char *ssl_version; /* human readable string */ - long ssl_version_num; /* not used anymore, always 0 */ - const char *libz_version; /* human readable string */ - /* protocols is terminated by an entry with a NULL protoname */ - const char * const *protocols; - - /* The fields below this were added in CURLVERSION_SECOND */ - const char *ares; - int ares_num; - - /* This field was added in CURLVERSION_THIRD */ - const char *libidn; - - /* These field were added in CURLVERSION_FOURTH */ - - /* Same as '_libiconv_version' if built with HAVE_ICONV */ - int iconv_ver_num; - - const char *libssh_version; /* human readable string */ - -} curl_version_info_data; - -#define CURL_VERSION_IPV6 (1<<0) /* IPv6-enabled */ -#define CURL_VERSION_KERBEROS4 (1<<1) /* kerberos auth is supported */ -#define CURL_VERSION_SSL (1<<2) /* SSL options are present */ -#define CURL_VERSION_LIBZ (1<<3) /* libz features are present */ -#define CURL_VERSION_NTLM (1<<4) /* NTLM auth is supported */ -#define CURL_VERSION_GSSNEGOTIATE (1<<5) /* Negotiate auth support */ -#define CURL_VERSION_DEBUG (1<<6) /* built with debug capabilities */ -#define CURL_VERSION_ASYNCHDNS (1<<7) /* asynchronous dns resolves */ -#define CURL_VERSION_SPNEGO (1<<8) /* SPNEGO auth */ -#define CURL_VERSION_LARGEFILE (1<<9) /* supports files bigger than 2GB */ -#define CURL_VERSION_IDN (1<<10) /* International Domain Names support */ -#define CURL_VERSION_SSPI (1<<11) /* SSPI is supported */ -#define CURL_VERSION_CONV (1<<12) /* character conversions supported */ -#define CURL_VERSION_CURLDEBUG (1<<13) /* debug memory tracking supported */ - -/* - * NAME curl_version_info() - * - * DESCRIPTION - * - * This function returns a pointer to a static copy of the version info - * struct. See above. - */ -CURL_EXTERN curl_version_info_data *curl_version_info(CURLversion); - -/* - * NAME curl_easy_strerror() - * - * DESCRIPTION - * - * The curl_easy_strerror function may be used to turn a CURLcode value - * into the equivalent human readable error string. This is useful - * for printing meaningful error messages. - */ -CURL_EXTERN const char *curl_easy_strerror(CURLcode); - -/* - * NAME curl_share_strerror() - * - * DESCRIPTION - * - * The curl_share_strerror function may be used to turn a CURLSHcode value - * into the equivalent human readable error string. This is useful - * for printing meaningful error messages. - */ -CURL_EXTERN const char *curl_share_strerror(CURLSHcode); - -/* - * NAME curl_easy_pause() - * - * DESCRIPTION - * - * The curl_easy_pause function pauses or unpauses transfers. Select the new - * state by setting the bitmask, use the convenience defines below. 
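A sketch of querying the runtime version struct and testing one of the feature bits listed above (illustrative, not part of the patch):

#include <stdio.h>
#include <curl/curl.h>

static void report_ssl_support(void)
{
    curl_version_info_data *info = curl_version_info(CURLVERSION_NOW);
    printf("libcurl %s: SSL %s\n", info->version,
           (info->features & CURL_VERSION_SSL) ? "yes" : "no");
}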
- * - */ -CURL_EXTERN CURLcode curl_easy_pause(CURL *handle, int bitmask); - -#define CURLPAUSE_RECV (1<<0) -#define CURLPAUSE_RECV_CONT (0) - -#define CURLPAUSE_SEND (1<<2) -#define CURLPAUSE_SEND_CONT (0) - -#define CURLPAUSE_ALL (CURLPAUSE_RECV|CURLPAUSE_SEND) -#define CURLPAUSE_CONT (CURLPAUSE_RECV_CONT|CURLPAUSE_SEND_CONT) - -#ifdef __cplusplus -} -#endif - -/* unfortunately, the easy.h and multi.h include files need options and info - stuff before they can be included! */ -#include "easy.h" /* nothing in curl is fun without the easy stuff */ -#include "multi.h" - -/* the typechecker doesn't work in C++ (yet) */ -#if defined(__GNUC__) && defined(__GNUC_MINOR__) && \ - ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) && \ - !defined(__cplusplus) && !defined(CURL_DISABLE_TYPECHECK) -#include "typecheck-gcc.h" -#else -#if defined(__STDC__) && (__STDC__ >= 1) -/* This preprocessor magic that replaces a call with the exact same call is - only done to make sure application authors pass exactly three arguments - to these functions. */ -#define curl_easy_setopt(handle,opt,param) curl_easy_setopt(handle,opt,param) -#define curl_easy_getinfo(handle,info,arg) curl_easy_getinfo(handle,info,arg) -#define curl_share_setopt(share,opt,param) curl_share_setopt(share,opt,param) -#define curl_multi_setopt(handle,opt,param) curl_multi_setopt(handle,opt,param) -#endif /* __STDC__ >= 1 */ -#endif /* gcc >= 4.3 && !__cplusplus */ - -#endif /* __CURL_CURL_H */ diff --git a/common/curl/curlbuild.h b/common/curl/curlbuild.h deleted file mode 100644 index d0b32acb..00000000 --- a/common/curl/curlbuild.h +++ /dev/null @@ -1,583 +0,0 @@ -#ifndef __CURL_CURLBUILD_H -#define __CURL_CURLBUILD_H -/*************************************************************************** - * _ _ ____ _ - * Project ___| | | | _ \| | - * / __| | | | |_) | | - * | (__| |_| | _ <| |___ - * \___|\___/|_| \_\_____| - * - * Copyright (C) 1998 - 2010, Daniel Stenberg, , et al. - * - * This software is licensed as described in the file COPYING, which - * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. - * - * You may opt to use, copy, modify, merge, publish, distribute and/or sell - * copies of the Software, and permit persons to whom the Software is - * furnished to do so, under the terms of the COPYING file. - * - * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY - * KIND, either express or implied. - * - ***************************************************************************/ - -/* ================================================================ */ -/* NOTES FOR CONFIGURE CAPABLE SYSTEMS */ -/* ================================================================ */ - -/* - * NOTE 1: - * ------- - * - * See file include/curl/curlbuild.h.in, run configure, and forget - * that this file exists it is only used for non-configure systems. - * But you can keep reading if you want ;-) - * - */ - -/* ================================================================ */ -/* NOTES FOR NON-CONFIGURE SYSTEMS */ -/* ================================================================ */ - -/* - * NOTE 1: - * ------- - * - * Nothing in this file is intended to be modified or adjusted by the - * curl library user nor by the curl library builder. 
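Closing out the curl.h API above: the pause bitmask in use, typically from within a write callback (a sketch, not part of the patch):

#include <curl/curl.h>

static void pause_then_resume(CURL *handle)
{
    curl_easy_pause(handle, CURLPAUSE_RECV); /* stop the receive side   */
    /* ... drain application buffers ... */
    curl_easy_pause(handle, CURLPAUSE_CONT); /* unpause both directions */
}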
- * - * If you think that something actually needs to be changed, adjusted - * or fixed in this file, then, report it on the libcurl development - * mailing list: http://cool.haxx.se/mailman/listinfo/curl-library/ - * - * Try to keep one section per platform, compiler and architecture, - * otherwise, if an existing section is reused for a different one and - * later on the original is adjusted, probably the piggybacking one can - * be adversely changed. - * - * In order to differentiate between platforms/compilers/architectures - * use only compiler built in predefined preprocessor symbols. - * - * This header file shall only export symbols which are 'curl' or 'CURL' - * prefixed, otherwise public name space would be polluted. - * - * NOTE 2: - * ------- - * - * For any given platform/compiler curl_off_t must be typedef'ed to a - * 64-bit wide signed integral data type. The width of this data type - * must remain constant and independent of any possible large file - * support settings. - * - * As an exception to the above, curl_off_t shall be typedef'ed to a - * 32-bit wide signed integral data type if there is no 64-bit type. - * - * As a general rule, curl_off_t shall not be mapped to off_t. This - * rule shall only be violated if off_t is the only 64-bit data type - * available and the size of off_t is independent of large file support - * settings. Keep your build on the safe side avoiding an off_t gating. - * If you have a 64-bit off_t then take for sure that another 64-bit - * data type exists, dig deeper and you will find it. - * - * NOTE 3: - * ------- - * - * Right now you might be staring at file include/curl/curlbuild.h.dist or - * at file include/curl/curlbuild.h, this is due to the following reason: - * file include/curl/curlbuild.h.dist is renamed to include/curl/curlbuild.h - * when the libcurl source code distribution archive file is created. - * - * File include/curl/curlbuild.h.dist is not included in the distribution - * archive. File include/curl/curlbuild.h is not present in the git tree. - * - * The distributed include/curl/curlbuild.h file is only intended to be used - * on systems which can not run the also distributed configure script. - * - * On systems capable of running the configure script, the configure process - * will overwrite the distributed include/curl/curlbuild.h file with one that - * is suitable and specific to the library being configured and built, which - * is generated from the include/curl/curlbuild.h.in template file. - * - * If you check out from git on a non-configure platform, you must run the - * appropriate buildconf* script to set up curlbuild.h and other local files. 
- * - */ - -/* ================================================================ */ -/* DEFINITION OF THESE SYMBOLS SHALL NOT TAKE PLACE ANYWHERE ELSE */ -/* ================================================================ */ - -#ifdef CURL_SIZEOF_LONG -# error "CURL_SIZEOF_LONG shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_SIZEOF_LONG_already_defined -#endif - -#ifdef CURL_TYPEOF_CURL_SOCKLEN_T -# error "CURL_TYPEOF_CURL_SOCKLEN_T shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_TYPEOF_CURL_SOCKLEN_T_already_defined -#endif - -#ifdef CURL_SIZEOF_CURL_SOCKLEN_T -# error "CURL_SIZEOF_CURL_SOCKLEN_T shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_SIZEOF_CURL_SOCKLEN_T_already_defined -#endif - -#ifdef CURL_TYPEOF_CURL_OFF_T -# error "CURL_TYPEOF_CURL_OFF_T shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_TYPEOF_CURL_OFF_T_already_defined -#endif - -#ifdef CURL_FORMAT_CURL_OFF_T -# error "CURL_FORMAT_CURL_OFF_T shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_FORMAT_CURL_OFF_T_already_defined -#endif - -#ifdef CURL_FORMAT_CURL_OFF_TU -# error "CURL_FORMAT_CURL_OFF_TU shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_FORMAT_CURL_OFF_TU_already_defined -#endif - -#ifdef CURL_FORMAT_OFF_T -# error "CURL_FORMAT_OFF_T shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_FORMAT_OFF_T_already_defined -#endif - -#ifdef CURL_SIZEOF_CURL_OFF_T -# error "CURL_SIZEOF_CURL_OFF_T shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_SIZEOF_CURL_OFF_T_already_defined -#endif - -#ifdef CURL_SUFFIX_CURL_OFF_T -# error "CURL_SUFFIX_CURL_OFF_T shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_SUFFIX_CURL_OFF_T_already_defined -#endif - -#ifdef CURL_SUFFIX_CURL_OFF_TU -# error "CURL_SUFFIX_CURL_OFF_TU shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_SUFFIX_CURL_OFF_TU_already_defined -#endif - -/* ================================================================ */ -/* EXTERNAL INTERFACE SETTINGS FOR NON-CONFIGURE SYSTEMS ONLY */ -/* ================================================================ */ - -#if defined(__DJGPP__) || defined(__GO32__) -# if defined(__DJGPP__) && (__DJGPP__ > 1) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long long -# define CURL_FORMAT_CURL_OFF_T "lld" -# define CURL_FORMAT_CURL_OFF_TU "llu" -# define CURL_FORMAT_OFF_T "%lld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T LL -# define CURL_SUFFIX_CURL_OFF_TU ULL -# else -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 4 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# endif -# define CURL_TYPEOF_CURL_SOCKLEN_T int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -#elif defined(__SALFORDC__) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 4 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# define CURL_TYPEOF_CURL_SOCKLEN_T int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -#elif defined(__BORLANDC__) -# if (__BORLANDC__ < 0x520) -# define CURL_SIZEOF_LONG 4 -# define 
CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 4 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# else -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T __int64 -# define CURL_FORMAT_CURL_OFF_T "I64d" -# define CURL_FORMAT_CURL_OFF_TU "I64u" -# define CURL_FORMAT_OFF_T "%I64d" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T i64 -# define CURL_SUFFIX_CURL_OFF_TU ui64 -# endif -# define CURL_TYPEOF_CURL_SOCKLEN_T int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -#elif defined(__TURBOC__) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 4 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# define CURL_TYPEOF_CURL_SOCKLEN_T int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -#elif defined(__WATCOMC__) -# if defined(__386__) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T __int64 -# define CURL_FORMAT_CURL_OFF_T "I64d" -# define CURL_FORMAT_CURL_OFF_TU "I64u" -# define CURL_FORMAT_OFF_T "%I64d" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T i64 -# define CURL_SUFFIX_CURL_OFF_TU ui64 -# else -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 4 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# endif -# define CURL_TYPEOF_CURL_SOCKLEN_T int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -#elif defined(__POCC__) -# if (__POCC__ < 280) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 4 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# elif defined(_MSC_VER) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T __int64 -# define CURL_FORMAT_CURL_OFF_T "I64d" -# define CURL_FORMAT_CURL_OFF_TU "I64u" -# define CURL_FORMAT_OFF_T "%I64d" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T i64 -# define CURL_SUFFIX_CURL_OFF_TU ui64 -# else -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long long -# define CURL_FORMAT_CURL_OFF_T "lld" -# define CURL_FORMAT_CURL_OFF_TU "llu" -# define CURL_FORMAT_OFF_T "%lld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T LL -# define CURL_SUFFIX_CURL_OFF_TU ULL -# endif -# define CURL_TYPEOF_CURL_SOCKLEN_T int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -#elif defined(__LCC__) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 4 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# define CURL_TYPEOF_CURL_SOCKLEN_T int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -#elif defined(__SYMBIAN32__) -# if defined(__EABI__) /* Treat all ARM compilers equally */ -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long long -# define CURL_FORMAT_CURL_OFF_T "lld" -# define CURL_FORMAT_CURL_OFF_TU "llu" -# define CURL_FORMAT_OFF_T "%lld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define 
CURL_SUFFIX_CURL_OFF_T LL -# define CURL_SUFFIX_CURL_OFF_TU ULL -# elif defined(__CW32__) -# pragma longlong on -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long long -# define CURL_FORMAT_CURL_OFF_T "lld" -# define CURL_FORMAT_CURL_OFF_TU "llu" -# define CURL_FORMAT_OFF_T "%lld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T LL -# define CURL_SUFFIX_CURL_OFF_TU ULL -# elif defined(__VC32__) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T __int64 -# define CURL_FORMAT_CURL_OFF_T "lld" -# define CURL_FORMAT_CURL_OFF_TU "llu" -# define CURL_FORMAT_OFF_T "%lld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T LL -# define CURL_SUFFIX_CURL_OFF_TU ULL -# endif -# define CURL_TYPEOF_CURL_SOCKLEN_T unsigned int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -#elif defined(__MWERKS__) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long long -# define CURL_FORMAT_CURL_OFF_T "lld" -# define CURL_FORMAT_CURL_OFF_TU "llu" -# define CURL_FORMAT_OFF_T "%lld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T LL -# define CURL_SUFFIX_CURL_OFF_TU ULL -# define CURL_TYPEOF_CURL_SOCKLEN_T int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -#elif defined(_WIN32_WCE) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T __int64 -# define CURL_FORMAT_CURL_OFF_T "I64d" -# define CURL_FORMAT_CURL_OFF_TU "I64u" -# define CURL_FORMAT_OFF_T "%I64d" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T i64 -# define CURL_SUFFIX_CURL_OFF_TU ui64 -# define CURL_TYPEOF_CURL_SOCKLEN_T int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -#elif defined(__MINGW32__) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long long -# define CURL_FORMAT_CURL_OFF_T "I64d" -# define CURL_FORMAT_CURL_OFF_TU "I64u" -# define CURL_FORMAT_OFF_T "%I64d" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T LL -# define CURL_SUFFIX_CURL_OFF_TU ULL -# define CURL_TYPEOF_CURL_SOCKLEN_T int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -#elif defined(__VMS) -# if defined(__VAX) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 4 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# else -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long long -# define CURL_FORMAT_CURL_OFF_T "lld" -# define CURL_FORMAT_CURL_OFF_TU "llu" -# define CURL_FORMAT_OFF_T "%lld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T LL -# define CURL_SUFFIX_CURL_OFF_TU ULL -# endif -# define CURL_TYPEOF_CURL_SOCKLEN_T unsigned int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -#elif defined(__OS400__) -# if defined(__ILEC400__) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long long -# define CURL_FORMAT_CURL_OFF_T "lld" -# define CURL_FORMAT_CURL_OFF_TU "llu" -# define CURL_FORMAT_OFF_T "%lld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T LL -# define CURL_SUFFIX_CURL_OFF_TU ULL -# define CURL_TYPEOF_CURL_SOCKLEN_T socklen_t -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 -# define CURL_PULL_SYS_TYPES_H 1 -# define CURL_PULL_SYS_SOCKET_H 1 -# endif - -#elif defined(__MVS__) -# if defined(__IBMC__) || defined(__IBMCPP__) -# if defined(_ILP32) -# define CURL_SIZEOF_LONG 4 -# elif defined(_LP64) -# define CURL_SIZEOF_LONG 8 -# endif -# if defined(_LONG_LONG) -# define CURL_TYPEOF_CURL_OFF_T long long -# define 
CURL_FORMAT_CURL_OFF_T "lld" -# define CURL_FORMAT_CURL_OFF_TU "llu" -# define CURL_FORMAT_OFF_T "%lld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T LL -# define CURL_SUFFIX_CURL_OFF_TU ULL -# elif defined(_LP64) -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# else -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 4 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# endif -# define CURL_TYPEOF_CURL_SOCKLEN_T socklen_t -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 -# define CURL_PULL_SYS_TYPES_H 1 -# define CURL_PULL_SYS_SOCKET_H 1 -# endif - -#elif defined(__370__) -# if defined(__IBMC__) || defined(__IBMCPP__) -# if defined(_ILP32) -# define CURL_SIZEOF_LONG 4 -# elif defined(_LP64) -# define CURL_SIZEOF_LONG 8 -# endif -# if defined(_LONG_LONG) -# define CURL_TYPEOF_CURL_OFF_T long long -# define CURL_FORMAT_CURL_OFF_T "lld" -# define CURL_FORMAT_CURL_OFF_TU "llu" -# define CURL_FORMAT_OFF_T "%lld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T LL -# define CURL_SUFFIX_CURL_OFF_TU ULL -# elif defined(_LP64) -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# else -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 4 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# endif -# define CURL_TYPEOF_CURL_SOCKLEN_T socklen_t -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 -# define CURL_PULL_SYS_TYPES_H 1 -# define CURL_PULL_SYS_SOCKET_H 1 -# endif - -#elif defined(TPF) -# define CURL_SIZEOF_LONG 8 -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# define CURL_TYPEOF_CURL_SOCKLEN_T int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -/* ===================================== */ -/* KEEP MSVC THE PENULTIMATE ENTRY */ -/* ===================================== */ - -#elif defined(_MSC_VER) -# if (_MSC_VER >= 900) && (_INTEGRAL_MAX_BITS >= 64) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T __int64 -# define CURL_FORMAT_CURL_OFF_T "I64d" -# define CURL_FORMAT_CURL_OFF_TU "I64u" -# define CURL_FORMAT_OFF_T "%I64d" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T i64 -# define CURL_SUFFIX_CURL_OFF_TU ui64 -# else -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 4 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# endif -# define CURL_TYPEOF_CURL_SOCKLEN_T int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -/* ===================================== */ -/* KEEP GENERIC GCC THE LAST ENTRY */ -/* ===================================== */ - -#elif 
defined(__GNUC__) -# if defined(__i386__) || defined(__ppc__) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long long -# define CURL_FORMAT_CURL_OFF_T "lld" -# define CURL_FORMAT_CURL_OFF_TU "llu" -# define CURL_FORMAT_OFF_T "%lld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T LL -# define CURL_SUFFIX_CURL_OFF_TU ULL -# elif defined(__x86_64__) || defined(__ppc64__) -# define CURL_SIZEOF_LONG 8 -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# endif -# define CURL_TYPEOF_CURL_SOCKLEN_T socklen_t -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 -# define CURL_PULL_SYS_TYPES_H 1 -# define CURL_PULL_SYS_SOCKET_H 1 - -#else -# error "Unknown non-configure build target!" - Error Compilation_aborted_Unknown_non_configure_build_target -#endif - -/* CURL_PULL_SYS_TYPES_H is defined above when inclusion of header file */ -/* sys/types.h is required here to properly make type definitions below. */ -#ifdef CURL_PULL_SYS_TYPES_H -# include -#endif - -/* CURL_PULL_SYS_SOCKET_H is defined above when inclusion of header file */ -/* sys/socket.h is required here to properly make type definitions below. */ -#ifdef CURL_PULL_SYS_SOCKET_H -# include -#endif - -/* Data type definition of curl_socklen_t. */ - -#ifdef CURL_TYPEOF_CURL_SOCKLEN_T - typedef CURL_TYPEOF_CURL_SOCKLEN_T curl_socklen_t; -#endif - -/* Data type definition of curl_off_t. */ - -#ifdef CURL_TYPEOF_CURL_OFF_T - typedef CURL_TYPEOF_CURL_OFF_T curl_off_t; -#endif - -#endif /* __CURL_CURLBUILD_H */ diff --git a/common/curl/curlrules.h b/common/curl/curlrules.h deleted file mode 100644 index 8aad1df6..00000000 --- a/common/curl/curlrules.h +++ /dev/null @@ -1,252 +0,0 @@ -#ifndef __CURL_CURLRULES_H -#define __CURL_CURLRULES_H -/*************************************************************************** - * _ _ ____ _ - * Project ___| | | | _ \| | - * / __| | | | |_) | | - * | (__| |_| | _ <| |___ - * \___|\___/|_| \_\_____| - * - * Copyright (C) 1998 - 2010, Daniel Stenberg, , et al. - * - * This software is licensed as described in the file COPYING, which - * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. - * - * You may opt to use, copy, modify, merge, publish, distribute and/or sell - * copies of the Software, and permit persons to whom the Software is - * furnished to do so, under the terms of the COPYING file. - * - * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY - * KIND, either express or implied. - * - ***************************************************************************/ - -/* ================================================================ */ -/* COMPILE TIME SANITY CHECKS */ -/* ================================================================ */ - -/* - * NOTE 1: - * ------- - * - * All checks done in this file are intentionally placed in a public - * header file which is pulled by curl/curl.h when an application is - * being built using an already built libcurl library. Additionally - * this file is also included and used when building the library. - * - * If compilation fails on this file it is certainly sure that the - * problem is elsewhere. 
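With curl_off_t typedef'ed per platform as above, the companion format macro is how it gets printed portably; a sketch assuming only what this header defines:

#include <stdio.h>
#include <curl/curl.h>

/* CURL_FORMAT_CURL_OFF_T expands to "lld", "ld", "I64d", ... to match
   the typedef, so no cast is needed. */
static void print_offset(curl_off_t off)
{
    printf("offset: %" CURL_FORMAT_CURL_OFF_T "\n", off);
}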
It could be a problem in the curlbuild.h - * header file, or simply that you are using different compilation - * settings than those used to build the library. - * - * Nothing in this file is intended to be modified or adjusted by the - * curl library user nor by the curl library builder. - * - * Do not deactivate any check, these are done to make sure that the - * library is properly built and used. - * - * You can find further help on the libcurl development mailing list: - * http://cool.haxx.se/mailman/listinfo/curl-library/ - * - * NOTE 2 - * ------ - * - * Some of the following compile time checks are based on the fact - * that the dimension of a constant array can not be a negative one. - * In this way if the compile time verification fails, the compilation - * will fail issuing an error. The error description wording is compiler - * dependent but it will be quite similar to one of the following: - * - * "negative subscript or subscript is too large" - * "array must have at least one element" - * "-1 is an illegal array size" - * "size of array is negative" - * - * If you are building an application which tries to use an already - * built libcurl library and you are getting this kind of errors on - * this file, it is a clear indication that there is a mismatch between - * how the library was built and how you are trying to use it for your - * application. Your already compiled or binary library provider is the - * only one who can give you the details you need to properly use it. - */ - -/* - * Verify that some macros are actually defined. - */ - -#ifndef CURL_SIZEOF_LONG -# error "CURL_SIZEOF_LONG definition is missing!" - Error Compilation_aborted_CURL_SIZEOF_LONG_is_missing -#endif - -#ifndef CURL_TYPEOF_CURL_SOCKLEN_T -# error "CURL_TYPEOF_CURL_SOCKLEN_T definition is missing!" - Error Compilation_aborted_CURL_TYPEOF_CURL_SOCKLEN_T_is_missing -#endif - -#ifndef CURL_SIZEOF_CURL_SOCKLEN_T -# error "CURL_SIZEOF_CURL_SOCKLEN_T definition is missing!" - Error Compilation_aborted_CURL_SIZEOF_CURL_SOCKLEN_T_is_missing -#endif - -#ifndef CURL_TYPEOF_CURL_OFF_T -# error "CURL_TYPEOF_CURL_OFF_T definition is missing!" - Error Compilation_aborted_CURL_TYPEOF_CURL_OFF_T_is_missing -#endif - -#ifndef CURL_FORMAT_CURL_OFF_T -# error "CURL_FORMAT_CURL_OFF_T definition is missing!" - Error Compilation_aborted_CURL_FORMAT_CURL_OFF_T_is_missing -#endif - -#ifndef CURL_FORMAT_CURL_OFF_TU -# error "CURL_FORMAT_CURL_OFF_TU definition is missing!" - Error Compilation_aborted_CURL_FORMAT_CURL_OFF_TU_is_missing -#endif - -#ifndef CURL_FORMAT_OFF_T -# error "CURL_FORMAT_OFF_T definition is missing!" - Error Compilation_aborted_CURL_FORMAT_OFF_T_is_missing -#endif - -#ifndef CURL_SIZEOF_CURL_OFF_T -# error "CURL_SIZEOF_CURL_OFF_T definition is missing!" - Error Compilation_aborted_CURL_SIZEOF_CURL_OFF_T_is_missing -#endif - -#ifndef CURL_SUFFIX_CURL_OFF_T -# error "CURL_SUFFIX_CURL_OFF_T definition is missing!" - Error Compilation_aborted_CURL_SUFFIX_CURL_OFF_T_is_missing -#endif - -#ifndef CURL_SUFFIX_CURL_OFF_TU -# error "CURL_SUFFIX_CURL_OFF_TU definition is missing!" - Error Compilation_aborted_CURL_SUFFIX_CURL_OFF_TU_is_missing -#endif - -/* - * Macros private to this header file. - */ - -#define CurlchkszEQ(t, s) sizeof(t) == s ? 1 : -1 - -#define CurlchkszGE(t1, t2) sizeof(t1) >= sizeof(t2) ? 1 : -1 - -/* - * Verify that the size previously defined and expected for long - * is the same as the one reported by sizeof() at compile time. 
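The negative-array-size trick described above, reduced to a self-contained form so the mechanism is visible in isolation (the names here are illustrative, not from the patch):

/* Size is 1 when the condition holds, -1 (a compile error) when it fails. */
#define DEMO_STATIC_ASSERT(cond, name) typedef char name[(cond) ? 1 : -1]

DEMO_STATIC_ASSERT(sizeof(int) >= 2, int_has_at_least_16_bits); /* compiles */
/* DEMO_STATIC_ASSERT(sizeof(int) == 1, int_is_one_byte);  -- would fail  */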
- */ - -typedef char - __curl_rule_01__ - [CurlchkszEQ(long, CURL_SIZEOF_LONG)]; - -/* - * Verify that the size previously defined and expected for - * curl_off_t is actually the the same as the one reported - * by sizeof() at compile time. - */ - -typedef char - __curl_rule_02__ - [CurlchkszEQ(curl_off_t, CURL_SIZEOF_CURL_OFF_T)]; - -/* - * Verify at compile time that the size of curl_off_t as reported - * by sizeof() is greater or equal than the one reported for long - * for the current compilation. - */ - -typedef char - __curl_rule_03__ - [CurlchkszGE(curl_off_t, long)]; - -/* - * Verify that the size previously defined and expected for - * curl_socklen_t is actually the the same as the one reported - * by sizeof() at compile time. - */ - -typedef char - __curl_rule_04__ - [CurlchkszEQ(curl_socklen_t, CURL_SIZEOF_CURL_SOCKLEN_T)]; - -/* - * Verify at compile time that the size of curl_socklen_t as reported - * by sizeof() is greater or equal than the one reported for int for - * the current compilation. - */ - -typedef char - __curl_rule_05__ - [CurlchkszGE(curl_socklen_t, int)]; - -/* ================================================================ */ -/* EXTERNALLY AND INTERNALLY VISIBLE DEFINITIONS */ -/* ================================================================ */ - -/* - * CURL_ISOCPP and CURL_OFF_T_C definitions are done here in order to allow - * these to be visible and exported by the external libcurl interface API, - * while also making them visible to the library internals, simply including - * setup.h, without actually needing to include curl.h internally. - * If some day this section would grow big enough, all this should be moved - * to its own header file. - */ - -/* - * Figure out if we can use the ## preprocessor operator, which is supported - * by ISO/ANSI C and C++. Some compilers support it without setting __STDC__ - * or __cplusplus so we need to carefully check for them too. - */ - -#if defined(__STDC__) || defined(_MSC_VER) || defined(__cplusplus) || \ - defined(__HP_aCC) || defined(__BORLANDC__) || defined(__LCC__) || \ - defined(__POCC__) || defined(__SALFORDC__) || defined(__HIGHC__) || \ - defined(__ILEC400__) - /* This compiler is believed to have an ISO compatible preprocessor */ -#define CURL_ISOCPP -#else - /* This compiler is believed NOT to have an ISO compatible preprocessor */ -#undef CURL_ISOCPP -#endif - -/* - * Macros for minimum-width signed and unsigned curl_off_t integer constants. - */ - -#ifdef CURL_ISOCPP -# define __CURL_OFF_T_C_HELPER2(Val,Suffix) Val ## Suffix -#else -# define __CURL_OFF_T_C_HELPER2(Val,Suffix) Val/**/Suffix -#endif -#define __CURL_OFF_T_C_HELPER1(Val,Suffix) __CURL_OFF_T_C_HELPER2(Val,Suffix) -#define CURL_OFF_T_C(Val) __CURL_OFF_T_C_HELPER1(Val,CURL_SUFFIX_CURL_OFF_T) -#define CURL_OFF_TU_C(Val) __CURL_OFF_T_C_HELPER1(Val,CURL_SUFFIX_CURL_OFF_TU) - -/* - * Get rid of macros private to this header file. - */ - -#undef CurlchkszEQ -#undef CurlchkszGE - -/* - * Get rid of macros not intended to exist beyond this point. 
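CURL_OFF_T_C from above in use: it pastes the platform's suffix onto a constant so literals wider than 32 bits stay valid even where plain long is 32-bit. CURLOPT_MAXFILESIZE_LARGE is assumed from the full option list (not in these hunks):

#include <curl/curl.h>

/* Refuse downloads larger than 8 GiB; 0x200000000 needs the LL/i64
   suffix on 32-bit-long platforms, which the macro supplies. */
static void cap_download(CURL *handle)
{
    curl_off_t limit = CURL_OFF_T_C(0x200000000);
    curl_easy_setopt(handle, CURLOPT_MAXFILESIZE_LARGE, limit);
}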
- */ - -#undef CURL_PULL_WS2TCPIP_H -#undef CURL_PULL_SYS_TYPES_H -#undef CURL_PULL_SYS_SOCKET_H -#undef CURL_PULL_STDINT_H -#undef CURL_PULL_INTTYPES_H - -#undef CURL_TYPEOF_CURL_SOCKLEN_T -#undef CURL_TYPEOF_CURL_OFF_T - -#ifdef CURL_NO_OLDIES -#undef CURL_FORMAT_OFF_T /* not required since 7.19.0 - obsoleted in 7.20.0 */ -#endif - -#endif /* __CURL_CURLRULES_H */ diff --git a/common/curl/curlver.h b/common/curl/curlver.h deleted file mode 100644 index e345f56d..00000000 --- a/common/curl/curlver.h +++ /dev/null @@ -1,69 +0,0 @@ -#ifndef __CURL_CURLVER_H -#define __CURL_CURLVER_H -/*************************************************************************** - * _ _ ____ _ - * Project ___| | | | _ \| | - * / __| | | | |_) | | - * | (__| |_| | _ <| |___ - * \___|\___/|_| \_\_____| - * - * Copyright (C) 1998 - 2010, Daniel Stenberg, , et al. - * - * This software is licensed as described in the file COPYING, which - * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. - * - * You may opt to use, copy, modify, merge, publish, distribute and/or sell - * copies of the Software, and permit persons to whom the Software is - * furnished to do so, under the terms of the COPYING file. - * - * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY - * KIND, either express or implied. - * - ***************************************************************************/ - -/* This header file contains nothing but libcurl version info, generated by - a script at release-time. This was made its own header file in 7.11.2 */ - -/* This is the global package copyright */ -#define LIBCURL_COPYRIGHT "1996 - 2010 Daniel Stenberg, ." - -/* This is the version number of the libcurl package from which this header - file origins: */ -#define LIBCURL_VERSION "7.21.2" - -/* The numeric version number is also available "in parts" by using these - defines: */ -#define LIBCURL_VERSION_MAJOR 7 -#define LIBCURL_VERSION_MINOR 21 -#define LIBCURL_VERSION_PATCH 2 - -/* This is the numeric version of the libcurl version number, meant for easier - parsing and comparions by programs. The LIBCURL_VERSION_NUM define will - always follow this syntax: - - 0xXXYYZZ - - Where XX, YY and ZZ are the main version, release and patch numbers in - hexadecimal (using 8 bits each). All three numbers are always represented - using two digits. 1.2 would appear as "0x010200" while version 9.11.7 - appears as "0x090b07". - - This 6-digit (24 bits) hexadecimal number does not show pre-release number, - and it is always a greater number in a more recent release. It makes - comparisons with greater than and less than work. -*/ -#define LIBCURL_VERSION_NUM 0x071502 - -/* - * This is the date and time when the full source package was created. The - * timestamp is not stored in git, as the timestamp is properly set in the - * tarballs by the maketgz script. 
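The packed LIBCURL_VERSION_NUM described above is meant for exactly this kind of compile-time gate (a sketch; the feature macro name is made up):

#include <curl/curl.h>

/* 7.21.2 packs to 0x071502, so numeric comparison tracks release order. */
#if LIBCURL_VERSION_NUM >= 0x071502
#  define DEMO_HAVE_RECENT_CURL 1
#else
#  define DEMO_HAVE_RECENT_CURL 0
#endif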
- * - * The format of the date should follow this template: - * - * "Mon Feb 12 11:35:33 UTC 2007" - */ -#define LIBCURL_TIMESTAMP "Tue Oct 12 22:03:31 UTC 2010" - -#endif /* __CURL_CURLVER_H */ diff --git a/common/curl/easy.h b/common/curl/easy.h deleted file mode 100644 index 1ddb4fe5..00000000 --- a/common/curl/easy.h +++ /dev/null @@ -1,102 +0,0 @@ -#ifndef __CURL_EASY_H -#define __CURL_EASY_H -/*************************************************************************** - * _ _ ____ _ - * Project ___| | | | _ \| | - * / __| | | | |_) | | - * | (__| |_| | _ <| |___ - * \___|\___/|_| \_\_____| - * - * Copyright (C) 1998 - 2008, Daniel Stenberg, , et al. - * - * This software is licensed as described in the file COPYING, which - * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. - * - * You may opt to use, copy, modify, merge, publish, distribute and/or sell - * copies of the Software, and permit persons to whom the Software is - * furnished to do so, under the terms of the COPYING file. - * - * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY - * KIND, either express or implied. - * - ***************************************************************************/ -#ifdef __cplusplus -extern "C" { -#endif - -CURL_EXTERN CURL *curl_easy_init(void); -CURL_EXTERN CURLcode curl_easy_setopt(CURL *curl, CURLoption option, ...); -CURL_EXTERN CURLcode curl_easy_perform(CURL *curl); -CURL_EXTERN void curl_easy_cleanup(CURL *curl); - -/* - * NAME curl_easy_getinfo() - * - * DESCRIPTION - * - * Request internal information from the curl session with this function. The - * third argument MUST be a pointer to a long, a pointer to a char * or a - * pointer to a double (as the documentation describes elsewhere). The data - * pointed to will be filled in accordingly and can be relied upon only if the - * function returns CURLE_OK. This function is intended to get used *AFTER* a - * performed transfer, all results from this function are undefined until the - * transfer is completed. - */ -CURL_EXTERN CURLcode curl_easy_getinfo(CURL *curl, CURLINFO info, ...); - - -/* - * NAME curl_easy_duphandle() - * - * DESCRIPTION - * - * Creates a new curl session handle with the same options set for the handle - * passed in. Duplicating a handle could only be a matter of cloning data and - * options, internal state info and things like persistant connections cannot - * be transfered. It is useful in multithreaded applications when you can run - * curl_easy_duphandle() for each new thread to avoid a series of identical - * curl_easy_setopt() invokes in every thread. - */ -CURL_EXTERN CURL* curl_easy_duphandle(CURL *curl); - -/* - * NAME curl_easy_reset() - * - * DESCRIPTION - * - * Re-initializes a CURL handle to the default values. This puts back the - * handle to the same state as it was in when it was just created. - * - * It does keep: live connections, the Session ID cache, the DNS cache and the - * cookies. - */ -CURL_EXTERN void curl_easy_reset(CURL *curl); - -/* - * NAME curl_easy_recv() - * - * DESCRIPTION - * - * Receives data from the connected socket. Use after successful - * curl_easy_perform() with CURLOPT_CONNECT_ONLY option. - */ -CURL_EXTERN CURLcode curl_easy_recv(CURL *curl, void *buffer, size_t buflen, - size_t *n); - -/* - * NAME curl_easy_send() - * - * DESCRIPTION - * - * Sends data over the connected socket. Use after successful - * curl_easy_perform() with CURLOPT_CONNECT_ONLY option. 
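The easy interface above, end to end: init, set options, perform, read results only after the transfer completes, clean up. CURLOPT_URL is assumed from the full option list; a sketch, not part of the patch:

#include <stdio.h>
#include <curl/curl.h>

static void fetch(const char *url)
{
    CURL *handle = curl_easy_init();
    if(!handle)
        return;
    curl_easy_setopt(handle, CURLOPT_URL, url);
    if(curl_easy_perform(handle) == CURLE_OK) {
        long code = 0;
        /* valid only after a completed transfer, per the note above */
        curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &code);
        printf("HTTP %ld\n", code);
    }
    curl_easy_cleanup(handle);
}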
- */ -CURL_EXTERN CURLcode curl_easy_send(CURL *curl, const void *buffer, - size_t buflen, size_t *n); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/common/curl/mprintf.h b/common/curl/mprintf.h deleted file mode 100644 index de7dd2f3..00000000 --- a/common/curl/mprintf.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef __CURL_MPRINTF_H -#define __CURL_MPRINTF_H -/*************************************************************************** - * _ _ ____ _ - * Project ___| | | | _ \| | - * / __| | | | |_) | | - * | (__| |_| | _ <| |___ - * \___|\___/|_| \_\_____| - * - * Copyright (C) 1998 - 2006, Daniel Stenberg, , et al. - * - * This software is licensed as described in the file COPYING, which - * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. - * - * You may opt to use, copy, modify, merge, publish, distribute and/or sell - * copies of the Software, and permit persons to whom the Software is - * furnished to do so, under the terms of the COPYING file. - * - * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY - * KIND, either express or implied. - * - ***************************************************************************/ - -#include -#include /* needed for FILE */ - -#include "curl.h" - -#ifdef __cplusplus -extern "C" { -#endif - -CURL_EXTERN int curl_mprintf(const char *format, ...); -CURL_EXTERN int curl_mfprintf(FILE *fd, const char *format, ...); -CURL_EXTERN int curl_msprintf(char *buffer, const char *format, ...); -CURL_EXTERN int curl_msnprintf(char *buffer, size_t maxlength, - const char *format, ...); -CURL_EXTERN int curl_mvprintf(const char *format, va_list args); -CURL_EXTERN int curl_mvfprintf(FILE *fd, const char *format, va_list args); -CURL_EXTERN int curl_mvsprintf(char *buffer, const char *format, va_list args); -CURL_EXTERN int curl_mvsnprintf(char *buffer, size_t maxlength, - const char *format, va_list args); -CURL_EXTERN char *curl_maprintf(const char *format, ...); -CURL_EXTERN char *curl_mvaprintf(const char *format, va_list args); - -#ifdef _MPRINTF_REPLACE -# undef printf -# undef fprintf -# undef sprintf -# undef vsprintf -# undef snprintf -# undef vprintf -# undef vfprintf -# undef vsnprintf -# undef aprintf -# undef vaprintf -# define printf curl_mprintf -# define fprintf curl_mfprintf -#ifdef CURLDEBUG -/* When built with CURLDEBUG we define away the sprintf() functions since we - don't want internal code to be using them */ -# define sprintf sprintf_was_used -# define vsprintf vsprintf_was_used -#else -# define sprintf curl_msprintf -# define vsprintf curl_mvsprintf -#endif -# define snprintf curl_msnprintf -# define vprintf curl_mvprintf -# define vfprintf curl_mvfprintf -# define vsnprintf curl_mvsnprintf -# define aprintf curl_maprintf -# define vaprintf curl_mvaprintf -#endif - -#ifdef __cplusplus -} -#endif - -#endif /* __CURL_MPRINTF_H */ diff --git a/common/curl/multi.h b/common/curl/multi.h deleted file mode 100644 index f9656666..00000000 --- a/common/curl/multi.h +++ /dev/null @@ -1,345 +0,0 @@ -#ifndef __CURL_MULTI_H -#define __CURL_MULTI_H -/*************************************************************************** - * _ _ ____ _ - * Project ___| | | | _ \| | - * / __| | | | |_) | | - * | (__| |_| | _ <| |___ - * \___|\___/|_| \_\_____| - * - * Copyright (C) 1998 - 2007, Daniel Stenberg, , et al. - * - * This software is licensed as described in the file COPYING, which - * you should have received as part of this distribution. 
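The mprintf family above mirrors the stdio prototypes; curl_msnprintf() is the bounded variant (a sketch, not part of the patch):

#include <curl/mprintf.h>

static void format_demo(void)
{
    char buf[64];
    /* behaves like snprintf(): never writes past the given maximum */
    curl_msnprintf(buf, sizeof(buf), "%d bytes from %s", 512, "example.com");
}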
The terms - * are also available at http://curl.haxx.se/docs/copyright.html. - * - * You may opt to use, copy, modify, merge, publish, distribute and/or sell - * copies of the Software, and permit persons to whom the Software is - * furnished to do so, under the terms of the COPYING file. - * - * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY - * KIND, either express or implied. - * - ***************************************************************************/ -/* - This is an "external" header file. Don't give away any internals here! - - GOALS - - o Enable a "pull" interface. The application that uses libcurl decides where - and when to ask libcurl to get/send data. - - o Enable multiple simultaneous transfers in the same thread without making it - complicated for the application. - - o Enable the application to select() on its own file descriptors and curl's - file descriptors simultaneous easily. - -*/ - -/* - * This header file should not really need to include "curl.h" since curl.h - * itself includes this file and we expect user applications to do #include - * without the need for especially including multi.h. - * - * For some reason we added this include here at one point, and rather than to - * break existing (wrongly written) libcurl applications, we leave it as-is - * but with this warning attached. - */ -#include "curl.h" - -#ifdef __cplusplus -extern "C" { -#endif - -typedef void CURLM; - -typedef enum { - CURLM_CALL_MULTI_PERFORM = -1, /* please call curl_multi_perform() or - curl_multi_socket*() soon */ - CURLM_OK, - CURLM_BAD_HANDLE, /* the passed-in handle is not a valid CURLM handle */ - CURLM_BAD_EASY_HANDLE, /* an easy handle was not good/valid */ - CURLM_OUT_OF_MEMORY, /* if you ever get this, you're in deep sh*t */ - CURLM_INTERNAL_ERROR, /* this is a libcurl bug */ - CURLM_BAD_SOCKET, /* the passed in socket argument did not match */ - CURLM_UNKNOWN_OPTION, /* curl_multi_setopt() with unsupported option */ - CURLM_LAST -} CURLMcode; - -/* just to make code nicer when using curl_multi_socket() you can now check - for CURLM_CALL_MULTI_SOCKET too in the same style it works for - curl_multi_perform() and CURLM_CALL_MULTI_PERFORM */ -#define CURLM_CALL_MULTI_SOCKET CURLM_CALL_MULTI_PERFORM - -typedef enum { - CURLMSG_NONE, /* first, not used */ - CURLMSG_DONE, /* This easy handle has completed. 'result' contains - the CURLcode of the transfer */ - CURLMSG_LAST /* last, not used */ -} CURLMSG; - -struct CURLMsg { - CURLMSG msg; /* what this message means */ - CURL *easy_handle; /* the handle it concerns */ - union { - void *whatever; /* message-specific data */ - CURLcode result; /* return code for transfer */ - } data; -}; -typedef struct CURLMsg CURLMsg; - -/* - * Name: curl_multi_init() - * - * Desc: inititalize multi-style curl usage - * - * Returns: a new CURLM handle to use in all 'curl_multi' functions. - */ -CURL_EXTERN CURLM *curl_multi_init(void); - -/* - * Name: curl_multi_add_handle() - * - * Desc: add a standard curl handle to the multi stack - * - * Returns: CURLMcode type, general multi error code. - */ -CURL_EXTERN CURLMcode curl_multi_add_handle(CURLM *multi_handle, - CURL *curl_handle); - - /* - * Name: curl_multi_remove_handle() - * - * Desc: removes a curl handle from the multi stack again - * - * Returns: CURLMcode type, general multi error code. - */ -CURL_EXTERN CURLMcode curl_multi_remove_handle(CURLM *multi_handle, - CURL *curl_handle); - - /* - * Name: curl_multi_fdset() - * - * Desc: Ask curl for its fd_set sets. 
The app can use these to select() or - * poll() on. We want curl_multi_perform() called as soon as one of - * them are ready. - * - * Returns: CURLMcode type, general multi error code. - */ -CURL_EXTERN CURLMcode curl_multi_fdset(CURLM *multi_handle, - fd_set *read_fd_set, - fd_set *write_fd_set, - fd_set *exc_fd_set, - int *max_fd); - - /* - * Name: curl_multi_perform() - * - * Desc: When the app thinks there's data available for curl it calls this - * function to read/write whatever there is right now. This returns - * as soon as the reads and writes are done. This function does not - * require that there actually is data available for reading or that - * data can be written, it can be called just in case. It returns - * the number of handles that still transfer data in the second - * argument's integer-pointer. - * - * Returns: CURLMcode type, general multi error code. *NOTE* that this only - * returns errors etc regarding the whole multi stack. There might - * still have occurred problems on invidual transfers even when this - * returns OK. - */ -CURL_EXTERN CURLMcode curl_multi_perform(CURLM *multi_handle, - int *running_handles); - - /* - * Name: curl_multi_cleanup() - * - * Desc: Cleans up and removes a whole multi stack. It does not free or - * touch any individual easy handles in any way. We need to define - * in what state those handles will be if this function is called - * in the middle of a transfer. - * - * Returns: CURLMcode type, general multi error code. - */ -CURL_EXTERN CURLMcode curl_multi_cleanup(CURLM *multi_handle); - -/* - * Name: curl_multi_info_read() - * - * Desc: Ask the multi handle if there's any messages/informationals from - * the individual transfers. Messages include informationals such as - * error code from the transfer or just the fact that a transfer is - * completed. More details on these should be written down as well. - * - * Repeated calls to this function will return a new struct each - * time, until a special "end of msgs" struct is returned as a signal - * that there is no more to get at this point. - * - * The data the returned pointer points to will not survive calling - * curl_multi_cleanup(). - * - * The 'CURLMsg' struct is meant to be very simple and only contain - * very basic informations. If more involved information is wanted, - * we will provide the particular "transfer handle" in that struct - * and that should/could/would be used in subsequent - * curl_easy_getinfo() calls (or similar). The point being that we - * must never expose complex structs to applications, as then we'll - * undoubtably get backwards compatibility problems in the future. - * - * Returns: A pointer to a filled-in struct, or NULL if it failed or ran out - * of structs. It also writes the number of messages left in the - * queue (after this read) in the integer the second argument points - * to. - */ -CURL_EXTERN CURLMsg *curl_multi_info_read(CURLM *multi_handle, - int *msgs_in_queue); - -/* - * Name: curl_multi_strerror() - * - * Desc: The curl_multi_strerror function may be used to turn a CURLMcode - * value into the equivalent human readable error string. This is - * useful for printing meaningful error messages. - * - * Returns: A pointer to a zero-terminated error message. 
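Tying the pieces above together — curl_multi_fdset(), curl_multi_perform(), curl_multi_info_read(), curl_multi_strerror() — here is a sketch of the classic select()-driven multi loop; the fixed one-second tick stands in for a proper curl_multi_timeout() query, and all names are illustrative:

    #include <stdio.h>
    #include <sys/select.h>
    #include <curl/curl.h>

    static void multi_loop_sketch(CURLM *m)
    {
        int running = 1;
        while (running) {
            fd_set r, w, e;
            int maxfd = -1;
            FD_ZERO(&r); FD_ZERO(&w); FD_ZERO(&e);
            curl_multi_fdset(m, &r, &w, &e, &maxfd);
            if (maxfd >= 0) {
                struct timeval tv = {1, 0};  /* should come from curl_multi_timeout() */
                select(maxfd + 1, &r, &w, &e, &tv);
            }
            CURLMcode rc;
            while ((rc = curl_multi_perform(m, &running)) == CURLM_CALL_MULTI_PERFORM)
                ;  /* keep calling, per the CURLM_CALL_MULTI_PERFORM contract */
            if (rc != CURLM_OK)
                fprintf(stderr, "multi: %s\n", curl_multi_strerror(rc));
            int queued = 0;
            CURLMsg *msg;
            while ((msg = curl_multi_info_read(m, &queued)) != NULL)
                if (msg->msg == CURLMSG_DONE)
                    fprintf(stderr, "transfer done, result %d\n", (int) msg->data.result);
        }
    }

Note that per the docs above, errors reported by curl_multi_perform() concern the multi stack as a whole; individual transfer results arrive through curl_multi_info_read().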
- */ -CURL_EXTERN const char *curl_multi_strerror(CURLMcode); - -/* - * Name: curl_multi_socket() and - * curl_multi_socket_all() - * - * Desc: An alternative version of curl_multi_perform() that allows the - * application to pass in one of the file descriptors that have been - * detected to have "action" on them and let libcurl perform. - * See man page for details. - */ -#define CURL_POLL_NONE 0 -#define CURL_POLL_IN 1 -#define CURL_POLL_OUT 2 -#define CURL_POLL_INOUT 3 -#define CURL_POLL_REMOVE 4 - -#define CURL_SOCKET_TIMEOUT CURL_SOCKET_BAD - -#define CURL_CSELECT_IN 0x01 -#define CURL_CSELECT_OUT 0x02 -#define CURL_CSELECT_ERR 0x04 - -typedef int (*curl_socket_callback)(CURL *easy, /* easy handle */ - curl_socket_t s, /* socket */ - int what, /* see above */ - void *userp, /* private callback - pointer */ - void *socketp); /* private socket - pointer */ -/* - * Name: curl_multi_timer_callback - * - * Desc: Called by libcurl whenever the library detects a change in the - * maximum number of milliseconds the app is allowed to wait before - * curl_multi_socket() or curl_multi_perform() must be called - * (to allow libcurl's timed events to take place). - * - * Returns: The callback should return zero. - */ -typedef int (*curl_multi_timer_callback)(CURLM *multi, /* multi handle */ - long timeout_ms, /* see above */ - void *userp); /* private callback - pointer */ - -CURL_EXTERN CURLMcode curl_multi_socket(CURLM *multi_handle, curl_socket_t s, - int *running_handles); - -CURL_EXTERN CURLMcode curl_multi_socket_action(CURLM *multi_handle, - curl_socket_t s, - int ev_bitmask, - int *running_handles); - -CURL_EXTERN CURLMcode curl_multi_socket_all(CURLM *multi_handle, - int *running_handles); - -#ifndef CURL_ALLOW_OLD_MULTI_SOCKET -/* This macro below was added in 7.16.3 to push users who recompile to use - the new curl_multi_socket_action() instead of the old curl_multi_socket() -*/ -#define curl_multi_socket(x,y,z) curl_multi_socket_action(x,y,0,z) -#endif - -/* - * Name: curl_multi_timeout() - * - * Desc: Returns the maximum number of milliseconds the app is allowed to - * wait before curl_multi_socket() or curl_multi_perform() must be - * called (to allow libcurl's timed events to take place). - * - * Returns: CURLM error code. - */ -CURL_EXTERN CURLMcode curl_multi_timeout(CURLM *multi_handle, - long *milliseconds); - -#undef CINIT /* re-using the same name as in curl.h */ - -#ifdef CURL_ISOCPP -#define CINIT(name,type,num) CURLMOPT_ ## name = CURLOPTTYPE_ ## type + num -#else -/* The macro "##" is ISO C, we assume pre-ISO C doesn't support it. 
*/ -#define LONG CURLOPTTYPE_LONG -#define OBJECTPOINT CURLOPTTYPE_OBJECTPOINT -#define FUNCTIONPOINT CURLOPTTYPE_FUNCTIONPOINT -#define OFF_T CURLOPTTYPE_OFF_T -#define CINIT(name,type,number) CURLMOPT_/**/name = type + number -#endif - -typedef enum { - /* This is the socket callback function pointer */ - CINIT(SOCKETFUNCTION, FUNCTIONPOINT, 1), - - /* This is the argument passed to the socket callback */ - CINIT(SOCKETDATA, OBJECTPOINT, 2), - - /* set to 1 to enable pipelining for this multi handle */ - CINIT(PIPELINING, LONG, 3), - - /* This is the timer callback function pointer */ - CINIT(TIMERFUNCTION, FUNCTIONPOINT, 4), - - /* This is the argument passed to the timer callback */ - CINIT(TIMERDATA, OBJECTPOINT, 5), - - /* maximum number of entries in the connection cache */ - CINIT(MAXCONNECTS, LONG, 6), - - CURLMOPT_LASTENTRY /* the last unused */ -} CURLMoption; - - -/* - * Name: curl_multi_setopt() - * - * Desc: Sets options for the multi handle. - * - * Returns: CURLM error code. - */ -CURL_EXTERN CURLMcode curl_multi_setopt(CURLM *multi_handle, - CURLMoption option, ...); - - -/* - * Name: curl_multi_assign() - * - * Desc: This function sets an association in the multi handle between the - * given socket and a private pointer of the application. This is - * (only) useful for curl_multi_socket uses. - * - * Returns: CURLM error code. - */ -CURL_EXTERN CURLMcode curl_multi_assign(CURLM *multi_handle, - curl_socket_t sockfd, void *sockp); - -#ifdef __cplusplus -} /* end of extern "C" */ -#endif - -#endif diff --git a/common/curl/stdcheaders.h b/common/curl/stdcheaders.h deleted file mode 100644 index ad82ef63..00000000 --- a/common/curl/stdcheaders.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef __STDC_HEADERS_H -#define __STDC_HEADERS_H -/*************************************************************************** - * _ _ ____ _ - * Project ___| | | | _ \| | - * / __| | | | |_) | | - * | (__| |_| | _ <| |___ - * \___|\___/|_| \_\_____| - * - * Copyright (C) 1998 - 2010, Daniel Stenberg, , et al. - * - * This software is licensed as described in the file COPYING, which - * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. - * - * You may opt to use, copy, modify, merge, publish, distribute and/or sell - * copies of the Software, and permit persons to whom the Software is - * furnished to do so, under the terms of the COPYING file. - * - * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY - * KIND, either express or implied. - * - ***************************************************************************/ - -#include - -size_t fread (void *, size_t, size_t, FILE *); -size_t fwrite (const void *, size_t, size_t, FILE *); - -int strcasecmp(const char *, const char *); -int strncasecmp(const char *, const char *, size_t); - -#endif /* __STDC_HEADERS_H */ diff --git a/common/curl/typecheck-gcc.h b/common/curl/typecheck-gcc.h deleted file mode 100644 index e6f74a95..00000000 --- a/common/curl/typecheck-gcc.h +++ /dev/null @@ -1,584 +0,0 @@ -#ifndef __CURL_TYPECHECK_GCC_H -#define __CURL_TYPECHECK_GCC_H -/*************************************************************************** - * _ _ ____ _ - * Project ___| | | | _ \| | - * / __| | | | |_) | | - * | (__| |_| | _ <| |___ - * \___|\___/|_| \_\_____| - * - * Copyright (C) 1998 - 2010, Daniel Stenberg, , et al. 
- * - * This software is licensed as described in the file COPYING, which - * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. - * - * You may opt to use, copy, modify, merge, publish, distribute and/or sell - * copies of the Software, and permit persons to whom the Software is - * furnished to do so, under the terms of the COPYING file. - * - * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY - * KIND, either express or implied. - * - ***************************************************************************/ - -/* wraps curl_easy_setopt() with typechecking */ - -/* To add a new kind of warning, add an - * if(_curl_is_sometype_option(_curl_opt)) - * if(!_curl_is_sometype(value)) - * _curl_easy_setopt_err_sometype(); - * block and define _curl_is_sometype_option, _curl_is_sometype and - * _curl_easy_setopt_err_sometype below - * - * NOTE: We use two nested 'if' statements here instead of the && operator, in - * order to work around gcc bug #32061. It affects only gcc 4.3.x/4.4.x - * when compiling with -Wlogical-op. - * - * To add an option that uses the same type as an existing option, you'll just - * need to extend the appropriate _curl_*_option macro - */ -#define curl_easy_setopt(handle, option, value) \ -__extension__ ({ \ - __typeof__ (option) _curl_opt = option; \ - if (__builtin_constant_p(_curl_opt)) { \ - if (_curl_is_long_option(_curl_opt)) \ - if (!_curl_is_long(value)) \ - _curl_easy_setopt_err_long(); \ - if (_curl_is_off_t_option(_curl_opt)) \ - if (!_curl_is_off_t(value)) \ - _curl_easy_setopt_err_curl_off_t(); \ - if (_curl_is_string_option(_curl_opt)) \ - if (!_curl_is_string(value)) \ - _curl_easy_setopt_err_string(); \ - if (_curl_is_write_cb_option(_curl_opt)) \ - if (!_curl_is_write_cb(value)) \ - _curl_easy_setopt_err_write_callback(); \ - if ((_curl_opt) == CURLOPT_READFUNCTION) \ - if (!_curl_is_read_cb(value)) \ - _curl_easy_setopt_err_read_cb(); \ - if ((_curl_opt) == CURLOPT_IOCTLFUNCTION) \ - if (!_curl_is_ioctl_cb(value)) \ - _curl_easy_setopt_err_ioctl_cb(); \ - if ((_curl_opt) == CURLOPT_SOCKOPTFUNCTION) \ - if (!_curl_is_sockopt_cb(value)) \ - _curl_easy_setopt_err_sockopt_cb(); \ - if ((_curl_opt) == CURLOPT_OPENSOCKETFUNCTION) \ - if (!_curl_is_opensocket_cb(value)) \ - _curl_easy_setopt_err_opensocket_cb(); \ - if ((_curl_opt) == CURLOPT_PROGRESSFUNCTION) \ - if (!_curl_is_progress_cb(value)) \ - _curl_easy_setopt_err_progress_cb(); \ - if ((_curl_opt) == CURLOPT_DEBUGFUNCTION) \ - if (!_curl_is_debug_cb(value)) \ - _curl_easy_setopt_err_debug_cb(); \ - if ((_curl_opt) == CURLOPT_SSL_CTX_FUNCTION) \ - if (!_curl_is_ssl_ctx_cb(value)) \ - _curl_easy_setopt_err_ssl_ctx_cb(); \ - if (_curl_is_conv_cb_option(_curl_opt)) \ - if (!_curl_is_conv_cb(value)) \ - _curl_easy_setopt_err_conv_cb(); \ - if ((_curl_opt) == CURLOPT_SEEKFUNCTION) \ - if (!_curl_is_seek_cb(value)) \ - _curl_easy_setopt_err_seek_cb(); \ - if (_curl_is_cb_data_option(_curl_opt)) \ - if (!_curl_is_cb_data(value)) \ - _curl_easy_setopt_err_cb_data(); \ - if ((_curl_opt) == CURLOPT_ERRORBUFFER) \ - if (!_curl_is_error_buffer(value)) \ - _curl_easy_setopt_err_error_buffer(); \ - if ((_curl_opt) == CURLOPT_STDERR) \ - if (!_curl_is_FILE(value)) \ - _curl_easy_setopt_err_FILE(); \ - if (_curl_is_postfields_option(_curl_opt)) \ - if (!_curl_is_postfields(value)) \ - _curl_easy_setopt_err_postfields(); \ - if ((_curl_opt) == CURLOPT_HTTPPOST) \ - if (!_curl_is_arr((value), struct 
curl_httppost)) \ - _curl_easy_setopt_err_curl_httpost(); \ - if (_curl_is_slist_option(_curl_opt)) \ - if (!_curl_is_arr((value), struct curl_slist)) \ - _curl_easy_setopt_err_curl_slist(); \ - if ((_curl_opt) == CURLOPT_SHARE) \ - if (!_curl_is_ptr((value), CURLSH)) \ - _curl_easy_setopt_err_CURLSH(); \ - } \ - curl_easy_setopt(handle, _curl_opt, value); \ -}) - -/* wraps curl_easy_getinfo() with typechecking */ -/* FIXME: don't allow const pointers */ -#define curl_easy_getinfo(handle, info, arg) \ -__extension__ ({ \ - __typeof__ (info) _curl_info = info; \ - if (__builtin_constant_p(_curl_info)) { \ - if (_curl_is_string_info(_curl_info)) \ - if (!_curl_is_arr((arg), char *)) \ - _curl_easy_getinfo_err_string(); \ - if (_curl_is_long_info(_curl_info)) \ - if (!_curl_is_arr((arg), long)) \ - _curl_easy_getinfo_err_long(); \ - if (_curl_is_double_info(_curl_info)) \ - if (!_curl_is_arr((arg), double)) \ - _curl_easy_getinfo_err_double(); \ - if (_curl_is_slist_info(_curl_info)) \ - if (!_curl_is_arr((arg), struct curl_slist *)) \ - _curl_easy_getinfo_err_curl_slist(); \ - } \ - curl_easy_getinfo(handle, _curl_info, arg); \ -}) - -/* TODO: typechecking for curl_share_setopt() and curl_multi_setopt(), - * for now just make sure that the functions are called with three - * arguments - */ -#define curl_share_setopt(share,opt,param) curl_share_setopt(share,opt,param) -#define curl_multi_setopt(handle,opt,param) curl_multi_setopt(handle,opt,param) - - -/* the actual warnings, triggered by calling the _curl_easy_setopt_err* - * functions */ - -/* To define a new warning, use _CURL_WARNING(identifier, "message") */ -#define _CURL_WARNING(id, message) \ - static void __attribute__((warning(message))) __attribute__((unused)) \ - __attribute__((noinline)) id(void) { __asm__(""); } - -_CURL_WARNING(_curl_easy_setopt_err_long, - "curl_easy_setopt expects a long argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_curl_off_t, - "curl_easy_setopt expects a curl_off_t argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_string, - "curl_easy_setopt expects a string (char* or char[]) argument for this option" - ) -_CURL_WARNING(_curl_easy_setopt_err_write_callback, - "curl_easy_setopt expects a curl_write_callback argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_read_cb, - "curl_easy_setopt expects a curl_read_callback argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_ioctl_cb, - "curl_easy_setopt expects a curl_ioctl_callback argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_sockopt_cb, - "curl_easy_setopt expects a curl_sockopt_callback argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_opensocket_cb, - "curl_easy_setopt expects a curl_opensocket_callback argument for this option" - ) -_CURL_WARNING(_curl_easy_setopt_err_progress_cb, - "curl_easy_setopt expects a curl_progress_callback argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_debug_cb, - "curl_easy_setopt expects a curl_debug_callback argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_ssl_ctx_cb, - "curl_easy_setopt expects a curl_ssl_ctx_callback argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_conv_cb, - "curl_easy_setopt expects a curl_conv_callback argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_seek_cb, - "curl_easy_setopt expects a curl_seek_callback argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_cb_data, - "curl_easy_setopt expects a private data pointer as argument for this option") 
-_CURL_WARNING(_curl_easy_setopt_err_error_buffer, - "curl_easy_setopt expects a char buffer of CURL_ERROR_SIZE as argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_FILE, - "curl_easy_setopt expects a FILE* argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_postfields, - "curl_easy_setopt expects a void* or char* argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_curl_httpost, - "curl_easy_setopt expects a struct curl_httppost* argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_curl_slist, - "curl_easy_setopt expects a struct curl_slist* argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_CURLSH, - "curl_easy_setopt expects a CURLSH* argument for this option") - -_CURL_WARNING(_curl_easy_getinfo_err_string, - "curl_easy_getinfo expects a pointer to char * for this info") -_CURL_WARNING(_curl_easy_getinfo_err_long, - "curl_easy_getinfo expects a pointer to long for this info") -_CURL_WARNING(_curl_easy_getinfo_err_double, - "curl_easy_getinfo expects a pointer to double for this info") -_CURL_WARNING(_curl_easy_getinfo_err_curl_slist, - "curl_easy_getinfo expects a pointer to struct curl_slist * for this info") - -/* groups of curl_easy_setops options that take the same type of argument */ - -/* To add a new option to one of the groups, just add - * (option) == CURLOPT_SOMETHING - * to the or-expression. If the option takes a long or curl_off_t, you don't - * have to do anything - */ - -/* evaluates to true if option takes a long argument */ -#define _curl_is_long_option(option) \ - (0 < (option) && (option) < CURLOPTTYPE_OBJECTPOINT) - -#define _curl_is_off_t_option(option) \ - ((option) > CURLOPTTYPE_OFF_T) - -/* evaluates to true if option takes a char* argument */ -#define _curl_is_string_option(option) \ - ((option) == CURLOPT_URL || \ - (option) == CURLOPT_PROXY || \ - (option) == CURLOPT_INTERFACE || \ - (option) == CURLOPT_NETRC_FILE || \ - (option) == CURLOPT_USERPWD || \ - (option) == CURLOPT_USERNAME || \ - (option) == CURLOPT_PASSWORD || \ - (option) == CURLOPT_PROXYUSERPWD || \ - (option) == CURLOPT_PROXYUSERNAME || \ - (option) == CURLOPT_PROXYPASSWORD || \ - (option) == CURLOPT_NOPROXY || \ - (option) == CURLOPT_ENCODING || \ - (option) == CURLOPT_REFERER || \ - (option) == CURLOPT_USERAGENT || \ - (option) == CURLOPT_COOKIE || \ - (option) == CURLOPT_COOKIEFILE || \ - (option) == CURLOPT_COOKIEJAR || \ - (option) == CURLOPT_COOKIELIST || \ - (option) == CURLOPT_FTPPORT || \ - (option) == CURLOPT_FTP_ALTERNATIVE_TO_USER || \ - (option) == CURLOPT_FTP_ACCOUNT || \ - (option) == CURLOPT_RANGE || \ - (option) == CURLOPT_CUSTOMREQUEST || \ - (option) == CURLOPT_SSLCERT || \ - (option) == CURLOPT_SSLCERTTYPE || \ - (option) == CURLOPT_SSLKEY || \ - (option) == CURLOPT_SSLKEYTYPE || \ - (option) == CURLOPT_KEYPASSWD || \ - (option) == CURLOPT_SSLENGINE || \ - (option) == CURLOPT_CAINFO || \ - (option) == CURLOPT_CAPATH || \ - (option) == CURLOPT_RANDOM_FILE || \ - (option) == CURLOPT_EGDSOCKET || \ - (option) == CURLOPT_SSL_CIPHER_LIST || \ - (option) == CURLOPT_KRBLEVEL || \ - (option) == CURLOPT_SSH_HOST_PUBLIC_KEY_MD5 || \ - (option) == CURLOPT_SSH_PUBLIC_KEYFILE || \ - (option) == CURLOPT_SSH_PRIVATE_KEYFILE || \ - (option) == CURLOPT_CRLFILE || \ - (option) == CURLOPT_ISSUERCERT || \ - (option) == CURLOPT_SOCKS5_GSSAPI_SERVICE || \ - (option) == CURLOPT_SSH_KNOWNHOSTS || \ - (option) == CURLOPT_MAIL_FROM || \ - (option) == CURLOPT_RTSP_SESSION_ID || \ - (option) == CURLOPT_RTSP_STREAM_URI || \ - (option) == 
CURLOPT_RTSP_TRANSPORT || \ - 0) - -/* evaluates to true if option takes a curl_write_callback argument */ -#define _curl_is_write_cb_option(option) \ - ((option) == CURLOPT_HEADERFUNCTION || \ - (option) == CURLOPT_WRITEFUNCTION) - -/* evaluates to true if option takes a curl_conv_callback argument */ -#define _curl_is_conv_cb_option(option) \ - ((option) == CURLOPT_CONV_TO_NETWORK_FUNCTION || \ - (option) == CURLOPT_CONV_FROM_NETWORK_FUNCTION || \ - (option) == CURLOPT_CONV_FROM_UTF8_FUNCTION) - -/* evaluates to true if option takes a data argument to pass to a callback */ -#define _curl_is_cb_data_option(option) \ - ((option) == CURLOPT_WRITEDATA || \ - (option) == CURLOPT_READDATA || \ - (option) == CURLOPT_IOCTLDATA || \ - (option) == CURLOPT_SOCKOPTDATA || \ - (option) == CURLOPT_OPENSOCKETDATA || \ - (option) == CURLOPT_PROGRESSDATA || \ - (option) == CURLOPT_WRITEHEADER || \ - (option) == CURLOPT_DEBUGDATA || \ - (option) == CURLOPT_SSL_CTX_DATA || \ - (option) == CURLOPT_SEEKDATA || \ - (option) == CURLOPT_PRIVATE || \ - (option) == CURLOPT_SSH_KEYDATA || \ - (option) == CURLOPT_INTERLEAVEDATA || \ - (option) == CURLOPT_CHUNK_DATA || \ - (option) == CURLOPT_FNMATCH_DATA || \ - 0) - -/* evaluates to true if option takes a POST data argument (void* or char*) */ -#define _curl_is_postfields_option(option) \ - ((option) == CURLOPT_POSTFIELDS || \ - (option) == CURLOPT_COPYPOSTFIELDS || \ - 0) - -/* evaluates to true if option takes a struct curl_slist * argument */ -#define _curl_is_slist_option(option) \ - ((option) == CURLOPT_HTTPHEADER || \ - (option) == CURLOPT_HTTP200ALIASES || \ - (option) == CURLOPT_QUOTE || \ - (option) == CURLOPT_POSTQUOTE || \ - (option) == CURLOPT_PREQUOTE || \ - (option) == CURLOPT_TELNETOPTIONS || \ - (option) == CURLOPT_MAIL_RCPT || \ - 0) - -/* groups of curl_easy_getinfo infos that take the same type of argument */ - -/* evaluates to true if info expects a pointer to char * argument */ -#define _curl_is_string_info(info) \ - (CURLINFO_STRING < (info) && (info) < CURLINFO_LONG) - -/* evaluates to true if info expects a pointer to long argument */ -#define _curl_is_long_info(info) \ - (CURLINFO_LONG < (info) && (info) < CURLINFO_DOUBLE) - -/* evaluates to true if info expects a pointer to double argument */ -#define _curl_is_double_info(info) \ - (CURLINFO_DOUBLE < (info) && (info) < CURLINFO_SLIST) - -/* true if info expects a pointer to struct curl_slist * argument */ -#define _curl_is_slist_info(info) \ - (CURLINFO_SLIST < (info)) - - -/* typecheck helpers -- check whether given expression has requested type*/ - -/* For pointers, you can use the _curl_is_ptr/_curl_is_arr macros, - * otherwise define a new macro. Search for __builtin_types_compatible_p - * in the GCC manual. - * NOTE: these macros MUST NOT EVALUATE their arguments! The argument is - * the actual expression passed to the curl_easy_setopt macro. This - * means that you can only apply the sizeof and __typeof__ operators, no - * == or whatsoever. 
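What these non-evaluating checks buy in practice: with typecheck-gcc.h active, a mistyped curl_easy_setopt() argument trips one of the _CURL_WARNING stubs at compile time rather than misbehaving at run time. A sketch (the option constants are real; the handle and values are illustrative):

    static void typecheck_sketch(void)
    {
        CURL *h = curl_easy_init();
        curl_easy_setopt(h, CURLOPT_TIMEOUT, 30L);          /* OK: long */
        curl_easy_setopt(h, CURLOPT_URL, "https://a.test"); /* OK: char* */
        /* With the macro above in effect, gcc would warn on this line:
           "curl_easy_setopt expects a long argument for this option" */
        curl_easy_setopt(h, CURLOPT_TIMEOUT, "30");
        curl_easy_cleanup(h);
    }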
- */ - -/* XXX: should evaluate to true iff expr is a pointer */ -#define _curl_is_any_ptr(expr) \ - (sizeof(expr) == sizeof(void*)) - -/* evaluates to true if expr is NULL */ -/* XXX: must not evaluate expr, so this check is not accurate */ -#define _curl_is_NULL(expr) \ - (__builtin_types_compatible_p(__typeof__(expr), __typeof__(NULL))) - -/* evaluates to true if expr is type*, const type* or NULL */ -#define _curl_is_ptr(expr, type) \ - (_curl_is_NULL(expr) || \ - __builtin_types_compatible_p(__typeof__(expr), type *) || \ - __builtin_types_compatible_p(__typeof__(expr), const type *)) - -/* evaluates to true if expr is one of type[], type*, NULL or const type* */ -#define _curl_is_arr(expr, type) \ - (_curl_is_ptr((expr), type) || \ - __builtin_types_compatible_p(__typeof__(expr), type [])) - -/* evaluates to true if expr is a string */ -#define _curl_is_string(expr) \ - (_curl_is_arr((expr), char) || \ - _curl_is_arr((expr), signed char) || \ - _curl_is_arr((expr), unsigned char)) - -/* evaluates to true if expr is a long (no matter the signedness) - * XXX: for now, int is also accepted (and therefore short and char, which - * are promoted to int when passed to a variadic function) */ -#define _curl_is_long(expr) \ - (__builtin_types_compatible_p(__typeof__(expr), long) || \ - __builtin_types_compatible_p(__typeof__(expr), signed long) || \ - __builtin_types_compatible_p(__typeof__(expr), unsigned long) || \ - __builtin_types_compatible_p(__typeof__(expr), int) || \ - __builtin_types_compatible_p(__typeof__(expr), signed int) || \ - __builtin_types_compatible_p(__typeof__(expr), unsigned int) || \ - __builtin_types_compatible_p(__typeof__(expr), short) || \ - __builtin_types_compatible_p(__typeof__(expr), signed short) || \ - __builtin_types_compatible_p(__typeof__(expr), unsigned short) || \ - __builtin_types_compatible_p(__typeof__(expr), char) || \ - __builtin_types_compatible_p(__typeof__(expr), signed char) || \ - __builtin_types_compatible_p(__typeof__(expr), unsigned char)) - -/* evaluates to true if expr is of type curl_off_t */ -#define _curl_is_off_t(expr) \ - (__builtin_types_compatible_p(__typeof__(expr), curl_off_t)) - -/* evaluates to true if expr is abuffer suitable for CURLOPT_ERRORBUFFER */ -/* XXX: also check size of an char[] array? */ -#define _curl_is_error_buffer(expr) \ - (__builtin_types_compatible_p(__typeof__(expr), char *) || \ - __builtin_types_compatible_p(__typeof__(expr), char[])) - -/* evaluates to true if expr is of type (const) void* or (const) FILE* */ -#if 0 -#define _curl_is_cb_data(expr) \ - (_curl_is_ptr((expr), void) || \ - _curl_is_ptr((expr), FILE)) -#else /* be less strict */ -#define _curl_is_cb_data(expr) \ - _curl_is_any_ptr(expr) -#endif - -/* evaluates to true if expr is of type FILE* */ -#define _curl_is_FILE(expr) \ - (__builtin_types_compatible_p(__typeof__(expr), FILE *)) - -/* evaluates to true if expr can be passed as POST data (void* or char*) */ -#define _curl_is_postfields(expr) \ - (_curl_is_ptr((expr), void) || \ - _curl_is_arr((expr), char)) - -/* FIXME: the whole callback checking is messy... - * The idea is to tolerate char vs. void and const vs. 
not const - * pointers in arguments at least - */ -/* helper: __builtin_types_compatible_p distinguishes between functions and - * function pointers, hide it */ -#define _curl_callback_compatible(func, type) \ - (__builtin_types_compatible_p(__typeof__(func), type) || \ - __builtin_types_compatible_p(__typeof__(func), type*)) - -/* evaluates to true if expr is of type curl_read_callback or "similar" */ -#define _curl_is_read_cb(expr) \ - (_curl_is_NULL(expr) || \ - __builtin_types_compatible_p(__typeof__(expr), __typeof__(fread)) || \ - __builtin_types_compatible_p(__typeof__(expr), curl_read_callback) || \ - _curl_callback_compatible((expr), _curl_read_callback1) || \ - _curl_callback_compatible((expr), _curl_read_callback2) || \ - _curl_callback_compatible((expr), _curl_read_callback3) || \ - _curl_callback_compatible((expr), _curl_read_callback4) || \ - _curl_callback_compatible((expr), _curl_read_callback5) || \ - _curl_callback_compatible((expr), _curl_read_callback6)) -typedef size_t (_curl_read_callback1)(char *, size_t, size_t, void*); -typedef size_t (_curl_read_callback2)(char *, size_t, size_t, const void*); -typedef size_t (_curl_read_callback3)(char *, size_t, size_t, FILE*); -typedef size_t (_curl_read_callback4)(void *, size_t, size_t, void*); -typedef size_t (_curl_read_callback5)(void *, size_t, size_t, const void*); -typedef size_t (_curl_read_callback6)(void *, size_t, size_t, FILE*); - -/* evaluates to true if expr is of type curl_write_callback or "similar" */ -#define _curl_is_write_cb(expr) \ - (_curl_is_read_cb(expr) || \ - __builtin_types_compatible_p(__typeof__(expr), __typeof__(fwrite)) || \ - __builtin_types_compatible_p(__typeof__(expr), curl_write_callback) || \ - _curl_callback_compatible((expr), _curl_write_callback1) || \ - _curl_callback_compatible((expr), _curl_write_callback2) || \ - _curl_callback_compatible((expr), _curl_write_callback3) || \ - _curl_callback_compatible((expr), _curl_write_callback4) || \ - _curl_callback_compatible((expr), _curl_write_callback5) || \ - _curl_callback_compatible((expr), _curl_write_callback6)) -typedef size_t (_curl_write_callback1)(const char *, size_t, size_t, void*); -typedef size_t (_curl_write_callback2)(const char *, size_t, size_t, - const void*); -typedef size_t (_curl_write_callback3)(const char *, size_t, size_t, FILE*); -typedef size_t (_curl_write_callback4)(const void *, size_t, size_t, void*); -typedef size_t (_curl_write_callback5)(const void *, size_t, size_t, - const void*); -typedef size_t (_curl_write_callback6)(const void *, size_t, size_t, FILE*); - -/* evaluates to true if expr is of type curl_ioctl_callback or "similar" */ -#define _curl_is_ioctl_cb(expr) \ - (_curl_is_NULL(expr) || \ - __builtin_types_compatible_p(__typeof__(expr), curl_ioctl_callback) || \ - _curl_callback_compatible((expr), _curl_ioctl_callback1) || \ - _curl_callback_compatible((expr), _curl_ioctl_callback2) || \ - _curl_callback_compatible((expr), _curl_ioctl_callback3) || \ - _curl_callback_compatible((expr), _curl_ioctl_callback4)) -typedef curlioerr (_curl_ioctl_callback1)(CURL *, int, void*); -typedef curlioerr (_curl_ioctl_callback2)(CURL *, int, const void*); -typedef curlioerr (_curl_ioctl_callback3)(CURL *, curliocmd, void*); -typedef curlioerr (_curl_ioctl_callback4)(CURL *, curliocmd, const void*); - -/* evaluates to true if expr is of type curl_sockopt_callback or "similar" */ -#define _curl_is_sockopt_cb(expr) \ - (_curl_is_NULL(expr) || \ - __builtin_types_compatible_p(__typeof__(expr), curl_sockopt_callback) || 
\ - _curl_callback_compatible((expr), _curl_sockopt_callback1) || \ - _curl_callback_compatible((expr), _curl_sockopt_callback2)) -typedef int (_curl_sockopt_callback1)(void *, curl_socket_t, curlsocktype); -typedef int (_curl_sockopt_callback2)(const void *, curl_socket_t, - curlsocktype); - -/* evaluates to true if expr is of type curl_opensocket_callback or "similar" */ -#define _curl_is_opensocket_cb(expr) \ - (_curl_is_NULL(expr) || \ - __builtin_types_compatible_p(__typeof__(expr), curl_opensocket_callback) ||\ - _curl_callback_compatible((expr), _curl_opensocket_callback1) || \ - _curl_callback_compatible((expr), _curl_opensocket_callback2) || \ - _curl_callback_compatible((expr), _curl_opensocket_callback3) || \ - _curl_callback_compatible((expr), _curl_opensocket_callback4)) -typedef curl_socket_t (_curl_opensocket_callback1) - (void *, curlsocktype, struct curl_sockaddr *); -typedef curl_socket_t (_curl_opensocket_callback2) - (void *, curlsocktype, const struct curl_sockaddr *); -typedef curl_socket_t (_curl_opensocket_callback3) - (const void *, curlsocktype, struct curl_sockaddr *); -typedef curl_socket_t (_curl_opensocket_callback4) - (const void *, curlsocktype, const struct curl_sockaddr *); - -/* evaluates to true if expr is of type curl_progress_callback or "similar" */ -#define _curl_is_progress_cb(expr) \ - (_curl_is_NULL(expr) || \ - __builtin_types_compatible_p(__typeof__(expr), curl_progress_callback) || \ - _curl_callback_compatible((expr), _curl_progress_callback1) || \ - _curl_callback_compatible((expr), _curl_progress_callback2)) -typedef int (_curl_progress_callback1)(void *, - double, double, double, double); -typedef int (_curl_progress_callback2)(const void *, - double, double, double, double); - -/* evaluates to true if expr is of type curl_debug_callback or "similar" */ -#define _curl_is_debug_cb(expr) \ - (_curl_is_NULL(expr) || \ - __builtin_types_compatible_p(__typeof__(expr), curl_debug_callback) || \ - _curl_callback_compatible((expr), _curl_debug_callback1) || \ - _curl_callback_compatible((expr), _curl_debug_callback2) || \ - _curl_callback_compatible((expr), _curl_debug_callback3) || \ - _curl_callback_compatible((expr), _curl_debug_callback4)) -typedef int (_curl_debug_callback1) (CURL *, - curl_infotype, char *, size_t, void *); -typedef int (_curl_debug_callback2) (CURL *, - curl_infotype, char *, size_t, const void *); -typedef int (_curl_debug_callback3) (CURL *, - curl_infotype, const char *, size_t, void *); -typedef int (_curl_debug_callback4) (CURL *, - curl_infotype, const char *, size_t, const void *); - -/* evaluates to true if expr is of type curl_ssl_ctx_callback or "similar" */ -/* this is getting even messier... 
*/ -#define _curl_is_ssl_ctx_cb(expr) \ - (_curl_is_NULL(expr) || \ - __builtin_types_compatible_p(__typeof__(expr), curl_ssl_ctx_callback) || \ - _curl_callback_compatible((expr), _curl_ssl_ctx_callback1) || \ - _curl_callback_compatible((expr), _curl_ssl_ctx_callback2) || \ - _curl_callback_compatible((expr), _curl_ssl_ctx_callback3) || \ - _curl_callback_compatible((expr), _curl_ssl_ctx_callback4) || \ - _curl_callback_compatible((expr), _curl_ssl_ctx_callback5) || \ - _curl_callback_compatible((expr), _curl_ssl_ctx_callback6) || \ - _curl_callback_compatible((expr), _curl_ssl_ctx_callback7) || \ - _curl_callback_compatible((expr), _curl_ssl_ctx_callback8)) -typedef CURLcode (_curl_ssl_ctx_callback1)(CURL *, void *, void *); -typedef CURLcode (_curl_ssl_ctx_callback2)(CURL *, void *, const void *); -typedef CURLcode (_curl_ssl_ctx_callback3)(CURL *, const void *, void *); -typedef CURLcode (_curl_ssl_ctx_callback4)(CURL *, const void *, const void *); -#ifdef HEADER_SSL_H -/* hack: if we included OpenSSL's ssl.h, we know about SSL_CTX - * this will of course break if we're included before OpenSSL headers... - */ -typedef CURLcode (_curl_ssl_ctx_callback5)(CURL *, SSL_CTX, void *); -typedef CURLcode (_curl_ssl_ctx_callback6)(CURL *, SSL_CTX, const void *); -typedef CURLcode (_curl_ssl_ctx_callback7)(CURL *, const SSL_CTX, void *); -typedef CURLcode (_curl_ssl_ctx_callback8)(CURL *, const SSL_CTX, const void *); -#else -typedef _curl_ssl_ctx_callback1 _curl_ssl_ctx_callback5; -typedef _curl_ssl_ctx_callback1 _curl_ssl_ctx_callback6; -typedef _curl_ssl_ctx_callback1 _curl_ssl_ctx_callback7; -typedef _curl_ssl_ctx_callback1 _curl_ssl_ctx_callback8; -#endif - -/* evaluates to true if expr is of type curl_conv_callback or "similar" */ -#define _curl_is_conv_cb(expr) \ - (_curl_is_NULL(expr) || \ - __builtin_types_compatible_p(__typeof__(expr), curl_conv_callback) || \ - _curl_callback_compatible((expr), _curl_conv_callback1) || \ - _curl_callback_compatible((expr), _curl_conv_callback2) || \ - _curl_callback_compatible((expr), _curl_conv_callback3) || \ - _curl_callback_compatible((expr), _curl_conv_callback4)) -typedef CURLcode (*_curl_conv_callback1)(char *, size_t length); -typedef CURLcode (*_curl_conv_callback2)(const char *, size_t length); -typedef CURLcode (*_curl_conv_callback3)(void *, size_t length); -typedef CURLcode (*_curl_conv_callback4)(const void *, size_t length); - -/* evaluates to true if expr is of type curl_seek_callback or "similar" */ -#define _curl_is_seek_cb(expr) \ - (_curl_is_NULL(expr) || \ - __builtin_types_compatible_p(__typeof__(expr), curl_seek_callback) || \ - _curl_callback_compatible((expr), _curl_seek_callback1) || \ - _curl_callback_compatible((expr), _curl_seek_callback2)) -typedef CURLcode (*_curl_seek_callback1)(void *, curl_off_t, int); -typedef CURLcode (*_curl_seek_callback2)(const void *, curl_off_t, int); - - -#endif /* __CURL_TYPECHECK_GCC_H */ diff --git a/common/curl/types.h b/common/curl/types.h deleted file mode 100644 index d37d6ae9..00000000 --- a/common/curl/types.h +++ /dev/null @@ -1 +0,0 @@ -/* not used */ diff --git a/common/sse2neon.h b/common/sse2neon.h new file mode 100644 index 00000000..9e512acf --- /dev/null +++ b/common/sse2neon.h @@ -0,0 +1,7598 @@ +#ifndef SSE2NEON_H +#define SSE2NEON_H + +// This header file provides a simple API translation layer +// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions +// +// This header file does not yet translate all of the SSE intrinsics. 
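The intended drop-in use of this header, as a sketch: keep the native intrinsics headers on x86 and substitute sse2neon on Arm, with no changes to the SSE-using code itself (the guard macros shown are one common choice, not the only one):

    #if defined(__i386__) || defined(__x86_64__)
    #include <xmmintrin.h>   /* native SSE */
    #include <emmintrin.h>   /* native SSE2 */
    #else
    #include "sse2neon.h"    /* same intrinsics, implemented with NEON */
    #endif

    #include <stdio.h>

    int main(void)
    {
        __m128 a = _mm_set_ps(4.f, 3.f, 2.f, 1.f);  /* lowest lane = 1.f */
        __m128 s = _mm_add_ps(a, a);
        printf("%f\n", _mm_cvtss_f32(s));           /* prints 2.000000 */
        return 0;
    }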
+// +// Contributors to this work are: +// John W. Ratcliff +// Brandon Rowlett +// Ken Fast +// Eric van Beurden +// Alexander Potylitsin +// Hasindu Gamaarachchi +// Jim Huang +// Mark Cheng +// Malcolm James MacLeod +// Devin Hussey (easyaspi314) +// Sebastian Pop +// Developer Ecosystem Engineering +// Danila Kutenin +// François Turban (JishinMaster) +// Pei-Hsuan Hung +// Yang-Hao Yuan +// Syoyo Fujita +// Brecht Van Lommel + +/* + * sse2neon is freely redistributable under the MIT License. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* Tunable configurations */ + +/* Enable precise implementation of math operations + * This would slow down the computation a bit, but gives consistent result with + * x86 SSE2. (e.g. would solve a hole or NaN pixel in the rendering result) + */ +/* _mm_min_ps and _mm_max_ps */ +#ifndef SSE2NEON_PRECISE_MINMAX +#define SSE2NEON_PRECISE_MINMAX (0) +#endif +/* _mm_rcp_ps and _mm_div_ps */ +#ifndef SSE2NEON_PRECISE_DIV +#define SSE2NEON_PRECISE_DIV (0) +#endif +/* _mm_sqrt_ps and _mm_rsqrt_ps */ +#ifndef SSE2NEON_PRECISE_SQRT +#define SSE2NEON_PRECISE_SQRT (0) +#endif + +#if defined(__GNUC__) || defined(__clang__) +#pragma push_macro("FORCE_INLINE") +#pragma push_macro("ALIGN_STRUCT") +#define FORCE_INLINE static inline __attribute__((always_inline)) +#define ALIGN_STRUCT(x) __attribute__((aligned(x))) +#ifndef likely +#define likely(x) __builtin_expect(!!(x), 1) +#endif +#ifndef unlikely +#define unlikely(x) __builtin_expect(!!(x), 0) +#endif +#else +#error "Macro name collisions may happen with unsupported compiler." +#ifdef FORCE_INLINE +#undef FORCE_INLINE +#endif +#define FORCE_INLINE static inline +#ifndef ALIGN_STRUCT +#define ALIGN_STRUCT(x) __declspec(align(x)) +#endif +#endif +#ifndef likely +#define likely(x) (x) +#endif +#ifndef unlikely +#define unlikely(x) (x) +#endif + +#include +#include + +/* Architecture-specific build options */ +/* FIXME: #pragma GCC push_options is only available on GCC */ +#if defined(__GNUC__) +#if defined(__arm__) && __ARM_ARCH == 7 +/* According to ARM C Language Extensions Architecture specification, + * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON) + * architecture supported. + */ +#if !defined(__ARM_NEON) || !defined(__ARM_NEON__) +#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON." 
+#endif +#if !defined(__clang__) +#pragma GCC push_options +#pragma GCC target("fpu=neon") +#endif +#elif defined(__aarch64__) +#if !defined(__clang__) +#pragma GCC push_options +#pragma GCC target("+simd") +#endif +#else +#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A." +#endif +#endif + +#include + +/* Rounding functions require either Aarch64 instructions or libm failback */ +#if !defined(__aarch64__) +#include +#endif + +/* "__has_builtin" can be used to query support for built-in functions + * provided by gcc/clang and other compilers that support it. + */ +#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */ +/* Compatibility with gcc <= 9 */ +#if __GNUC__ <= 9 +#define __has_builtin(x) HAS##x +#define HAS__builtin_popcount 1 +#define HAS__builtin_popcountll 1 +#else +#define __has_builtin(x) 0 +#endif +#endif + +/** + * MACRO for shuffle parameter for _mm_shuffle_ps(). + * Argument fp3 is a digit[0123] that represents the fp from argument "b" + * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same + * for fp2 in result. fp1 is a digit[0123] that represents the fp from + * argument "a" of mm_shuffle_ps that will be places in fp1 of result. + * fp0 is the same for fp0 of result. + */ +#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \ + (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0))) + +/* Rounding mode macros. */ +#define _MM_FROUND_TO_NEAREST_INT 0x00 +#define _MM_FROUND_TO_NEG_INF 0x01 +#define _MM_FROUND_TO_POS_INF 0x02 +#define _MM_FROUND_TO_ZERO 0x03 +#define _MM_FROUND_CUR_DIRECTION 0x04 +#define _MM_FROUND_NO_EXC 0x08 +#define _MM_ROUND_NEAREST 0x0000 +#define _MM_ROUND_DOWN 0x2000 +#define _MM_ROUND_UP 0x4000 +#define _MM_ROUND_TOWARD_ZERO 0x6000 + +/* indicate immediate constant argument in a given range */ +#define __constrange(a, b) const + +/* A few intrinsics accept traditional data types like ints or floats, but + * most operate on data types that are specific to SSE. + * If a vector type ends in d, it contains doubles, and if it does not have + * a suffix, it contains floats. An integer vector type can contain any type + * of integer, from chars to shorts to unsigned long longs. + */ +typedef int64x1_t __m64; +typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */ +// On ARM 32-bit architecture, the float64x2_t is not supported. +// The data type __m128d should be represented in a different way for related +// intrinsic conversion. 
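A sketch of the _MM_SHUFFLE() encoding above, which also shows that the typedefs just below make __m128 and float32x4_t literally the same type, so SSE-style and native NEON code can interleave freely:

    #include "sse2neon.h"

    static float32x4_t shuffle_sketch(__m128 a, __m128 b)
    {
        /* _MM_SHUFFLE(3,2,1,0) == 0xE4: take lanes 0,1 from a and
           lanes 2,3 from b, i.e. dst = { a[0], a[1], b[2], b[3] }. */
        __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0));
        return r;   /* no cast needed: __m128 is float32x4_t in this header */
    }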
+#if defined(__aarch64__) +typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */ +#else +typedef float32x4_t __m128d; +#endif +typedef int64x2_t __m128i; /* 128-bit vector containing integers */ + +/* type-safe casting between types */ + +#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x) +#define vreinterpretq_m128_f32(x) (x) +#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x) + +#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x) +#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x) +#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x) +#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x) + +#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x) +#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x) +#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x) +#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x) + +#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x) +#define vreinterpretq_f32_m128(x) (x) +#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x) + +#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x) +#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x) +#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x) +#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x) + +#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x) +#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x) +#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x) +#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x) + +#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x) +#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x) +#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x) +#define vreinterpretq_m128i_s64(x) (x) + +#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x) +#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x) +#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x) +#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x) + +#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x) +#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x) + +#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x) +#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x) +#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x) +#define vreinterpretq_s64_m128i(x) (x) + +#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x) +#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x) +#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x) +#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x) + +#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x) +#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x) +#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x) +#define vreinterpret_m64_s64(x) (x) + +#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x) +#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x) +#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x) +#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x) + +#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x) +#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x) +#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x) + +#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x) +#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x) +#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x) +#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x) + +#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x) +#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x) +#define 
vreinterpret_s32_m64(x) vreinterpret_s32_s64(x) +#define vreinterpret_s64_m64(x) (x) + +#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x) + +#if defined(__aarch64__) +#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x) +#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x) + +#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x) + +#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x) +#define vreinterpretq_m128d_f64(x) (x) + +#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x) + +#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x) +#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x) + +#define vreinterpretq_f64_m128d(x) (x) +#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x) +#else +#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x) +#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x) + +#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x) +#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x) + +#define vreinterpretq_m128d_f32(x) (x) + +#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x) + +#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x) +#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x) + +#define vreinterpretq_f32_m128d(x) (x) +#endif + +// A struct is defined in this header file called 'SIMDVec' which can be used +// by applications which attempt to access the contents of an _m128 struct +// directly. It is important to note that accessing the __m128 struct directly +// is bad coding practice by Microsoft: @see: +// https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx +// +// However, some legacy source code may try to access the contents of an __m128 +// struct directly so the developer can use the SIMDVec as an alias for it. Any +// casting must be done manually by the developer, as you cannot cast or +// otherwise alias the base NEON data type for intrinsic operations. +// +// union intended to allow direct access to an __m128 variable using the names +// that the MSVC compiler provides. This union should really only be used when +// trying to access the members of the vector as integer values. GCC/clang +// allow native access to the float members through a simple array access +// operator (in C since 4.6, in C++ since 4.8). +// +// Ideally direct accesses to SIMD vectors should not be used since it can cause +// a performance hit. If it really is needed however, the original __m128 +// variable can be aliased with a pointer to this union and used to access +// individual components. The use of this union should be hidden behind a macro +// that is used throughout the codebase to access the members instead of always +// declaring this type of variable. +typedef union ALIGN_STRUCT(16) SIMDVec { + float m128_f32[4]; // as floats - DON'T USE. Added for convenience. + int8_t m128_i8[16]; // as signed 8-bit integers. + int16_t m128_i16[8]; // as signed 16-bit integers. + int32_t m128_i32[4]; // as signed 32-bit integers. + int64_t m128_i64[2]; // as signed 64-bit integers. + uint8_t m128_u8[16]; // as unsigned 8-bit integers. + uint16_t m128_u16[8]; // as unsigned 16-bit integers. + uint32_t m128_u32[4]; // as unsigned 32-bit integers. + uint64_t m128_u64[2]; // as unsigned 64-bit integers. 
+} SIMDVec; + +// casting using SIMDVec +#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n]) +#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n]) +#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n]) + +/* Backwards compatibility for compilers with lack of specific type support */ + +// Older gcc does not define vld1q_u8_x4 type +#if defined(__GNUC__) && !defined(__clang__) && \ + ((__GNUC__ == 10 && (__GNUC_MINOR__ <= 1)) || \ + (__GNUC__ == 9 && (__GNUC_MINOR__ <= 3)) || \ + (__GNUC__ == 8 && (__GNUC_MINOR__ <= 4)) || __GNUC__ <= 7) +FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) +{ + uint8x16x4_t ret; + ret.val[0] = vld1q_u8(p + 0); + ret.val[1] = vld1q_u8(p + 16); + ret.val[2] = vld1q_u8(p + 32); + ret.val[3] = vld1q_u8(p + 48); + return ret; +} +#else +// Wraps vld1q_u8_x4 +FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) +{ + return vld1q_u8_x4(p); +} +#endif + +/* Function Naming Conventions + * The naming convention of SSE intrinsics is straightforward. A generic SSE + * intrinsic function is given as follows: + * _mm__ + * + * The parts of this format are given as follows: + * 1. describes the operation performed by the intrinsic + * 2. identifies the data type of the function's primary arguments + * + * This last part, , is a little complicated. It identifies the + * content of the input values, and can be set to any of the following values: + * + ps - vectors contain floats (ps stands for packed single-precision) + * + pd - vectors cantain doubles (pd stands for packed double-precision) + * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit + * signed integers + * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit + * unsigned integers + * + si128 - unspecified 128-bit vector or 256-bit vector + * + m128/m128i/m128d - identifies input vector types when they are different + * than the type of the returned vector + * + * For example, _mm_setzero_ps. The _mm implies that the function returns + * a 128-bit vector. The _ps at the end implies that the argument vectors + * contain floats. + * + * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8) + * // Set packed 16-bit integers. 
128 bits, 8 short, per 16 bits + * __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + * // Set packed 8-bit integers + * // 128 bits, 16 chars, per 8 bits + * __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11, + * 4, 5, 12, 13, 6, 7, 14, 15); + * // Shuffle packed 8-bit integers + * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb + * + * Data (Number, Binary, Byte Index): + +------+------+-------------+------+------+-------------+ + | 1 | 2 | 3 | 4 | Number + +------+------+------+------+------+------+------+------+ + | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary + +------+------+------+------+------+------+------+------+ + | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | Index + +------+------+------+------+------+------+------+------+ + + +------+------+------+------+------+------+------+------+ + | 5 | 6 | 7 | 8 | Number + +------+------+------+------+------+------+------+------+ + | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary + +------+------+------+------+------+------+------+------+ + | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | Index + +------+------+------+------+------+------+------+------+ + * Index (Byte Index): + +------+------+------+------+------+------+------+------+ + | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | + +------+------+------+------+------+------+------+------+ + + +------+------+------+------+------+------+------+------+ + | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | + +------+------+------+------+------+------+------+------+ + * Result: + +------+------+------+------+------+------+------+------+ + | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | Index + +------+------+------+------+------+------+------+------+ + | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary + +------+------+------+------+------+------+------+------+ + | 256 | 2 | 5 | 6 | Number + +------+------+------+------+------+------+------+------+ + + +------+------+------+------+------+------+------+------+ + | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | Index + +------+------+------+------+------+------+------+------+ + | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary + +------+------+------+------+------+------+------+------+ + | 3 | 7 | 4 | 8 | Number + +------+------+------+------+------+------+-------------+ + */ + +/* Set/get methods */ + +/* Constants for use with _mm_prefetch. */ +enum _mm_hint { + _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */ + _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */ + _MM_HINT_T1 = 2, /* load data to L2 cache only */ + _MM_HINT_T2 = 3, /* load data to L2 cache only, mark it as NTA */ + _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */ + _MM_HINT_ET0 = 5, /* exclusive version of _MM_HINT_T0 */ + _MM_HINT_ET1 = 6, /* exclusive version of _MM_HINT_T1 */ + _MM_HINT_ET2 = 7 /* exclusive version of _MM_HINT_T2 */ +}; + +// Loads one cache line of data from address p to a location closer to the +// processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx +FORCE_INLINE void _mm_prefetch(const void *p, int i) +{ + (void) i; + __builtin_prefetch(p); +} + +// Pause the processor. This is typically used in spin-wait loops and depending +// on the x86 processor typical values are in the 40-100 cycle range. The +// 'yield' instruction isn't a good fit beacuse it's effectively a nop on most +// Arm cores. Experience with several databases has shown has shown an 'isb' is +// a reasonable approximation. 
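Before the implementation that follows, the spin-wait shape _mm_pause() is meant for, as a sketch (the flag and its protocol are illustrative):

    #include <stdatomic.h>
    #include "sse2neon.h"

    static void spin_until_ready(atomic_int *ready)
    {
        /* Under this header _mm_pause() executes 'isb', per the rationale
           above; on x86 the same loop would emit the 'pause' instruction. */
        while (!atomic_load_explicit(ready, memory_order_acquire))
            _mm_pause();
    }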
+FORCE_INLINE void _mm_pause()
+{
+    __asm__ __volatile__("isb\n");
+}
+
+// Copy the lower single-precision (32-bit) floating-point element of a to dst.
+//
+// dst[31:0] := a[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32
+FORCE_INLINE float _mm_cvtss_f32(__m128 a)
+{
+    return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+}
+
+// Convert the lower single-precision (32-bit) floating-point element in b to a
+// double-precision (64-bit) floating-point element, store the result in the
+// lower element of dst, and copy the upper element from a to the upper element
+// of dst.
+//
+// dst[63:0] := Convert_FP32_To_FP64(b[31:0])
+// dst[127:64] := a[127:64]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sd
+FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
+{
+    double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
+#else
+    return vreinterpretq_m128d_s64(
+        vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
+#endif
+}
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer, and store the result in dst.
+//
+// dst[31:0] := Convert_FP32_To_Int32(a[31:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32
+#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 64-bit integer, and store the result in dst.
+//
+// dst[63:0] := Convert_FP32_To_Int64(a[31:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64
+FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
+{
+#if defined(__aarch64__)
+    return vgetq_lane_s64(
+        vreinterpretq_s64_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a))), 0);
+#else
+    float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    float32_t diff = data - floor(data);
+    if (diff > 0.5)
+        return (int64_t) ceil(data);
+    if (unlikely(diff == 0.5)) {
+        int64_t f = (int64_t) floor(data);
+        int64_t c = (int64_t) ceil(data);
+        return c & 1 ? f : c;
+    }
+    return (int64_t) floor(data);
+#endif
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+//
+// FOR j := 0 to 1
+// i := 32*j
+// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi
+FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
+{
+    return vreinterpret_m64_s32(
+        vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
+}
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer with truncation, and store the result in dst.
+//
+// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si
+FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
+{
+    return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+// +// FOR j := 0 to 1 +// i := 32*j +// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32 +#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer with truncation, and store the result in dst. +// +// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32 +#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// +// dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64 +FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a) +{ + return vgetq_lane_s64( + vmovl_s32(vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)))), 0); +} + +// Sets the 128-bit value to zero +// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx +FORCE_INLINE __m128i _mm_setzero_si128(void) +{ + return vreinterpretq_m128i_s32(vdupq_n_s32(0)); +} + +// Clears the four single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx +FORCE_INLINE __m128 _mm_setzero_ps(void) +{ + return vreinterpretq_m128_f32(vdupq_n_f32(0)); +} + +// Return vector of type __m128d with all elements set to zero. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd +FORCE_INLINE __m128d _mm_setzero_pd(void) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vdupq_n_f64(0)); +#else + return vreinterpretq_m128d_f32(vdupq_n_f32(0)); +#endif +} + +// Sets the four single-precision, floating-point values to w. +// +// r0 := r1 := r2 := r3 := w +// +// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx +FORCE_INLINE __m128 _mm_set1_ps(float _w) +{ + return vreinterpretq_m128_f32(vdupq_n_f32(_w)); +} + +// Sets the four single-precision, floating-point values to w. +// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx +FORCE_INLINE __m128 _mm_set_ps1(float _w) +{ + return vreinterpretq_m128_f32(vdupq_n_f32(_w)); +} + +// Sets the four single-precision, floating-point values to the four inputs. +// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx +FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x) +{ + float ALIGN_STRUCT(16) data[4] = {x, y, z, w}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +} + +// Copy single-precision (32-bit) floating-point element a to the lower element +// of dst, and zero the upper 3 elements. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss +FORCE_INLINE __m128 _mm_set_ss(float a) +{ + float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +} + +// Sets the four single-precision, floating-point values to the four inputs in +// reverse order. +// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx +FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x) +{ + float ALIGN_STRUCT(16) data[4] = {w, z, y, x}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +} + +// Sets the 8 signed 16-bit integer values in reverse order. +// +// Return Value +// r0 := w0 +// r1 := w1 +// ... 
+// r7 := w7 +FORCE_INLINE __m128i _mm_setr_epi16(short w0, + short w1, + short w2, + short w3, + short w4, + short w5, + short w6, + short w7) +{ + int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7}; + return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data)); +} + +// Sets the 4 signed 32-bit integer values in reverse order +// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx +FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0) +{ + int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0}; + return vreinterpretq_m128i_s32(vld1q_s32(data)); +} + +// Set packed 64-bit integers in dst with the supplied values in reverse order. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64 +FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0) +{ + return vreinterpretq_m128i_s64(vcombine_s64(e1, e0)); +} + +// Sets the 16 signed 8-bit integer values to b. +// +// r0 := b +// r1 := b +// ... +// r15 := b +// +// https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx +FORCE_INLINE __m128i _mm_set1_epi8(signed char w) +{ + return vreinterpretq_m128i_s8(vdupq_n_s8(w)); +} + +// Broadcast double-precision (64-bit) floating-point value a to all elements of +// dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd +FORCE_INLINE __m128d _mm_set1_pd(double d) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vdupq_n_f64(d)); +#else + return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d)); +#endif +} + +// Sets the 8 signed 16-bit integer values to w. +// +// r0 := w +// r1 := w +// ... +// r7 := w +// +// https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx +FORCE_INLINE __m128i _mm_set1_epi16(short w) +{ + return vreinterpretq_m128i_s16(vdupq_n_s16(w)); +} + +// Sets the 16 signed 8-bit integer values. +// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx +FORCE_INLINE __m128i _mm_set_epi8(signed char b15, + signed char b14, + signed char b13, + signed char b12, + signed char b11, + signed char b10, + signed char b9, + signed char b8, + signed char b7, + signed char b6, + signed char b5, + signed char b4, + signed char b3, + signed char b2, + signed char b1, + signed char b0) +{ + int8_t ALIGN_STRUCT(16) + data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, + (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, + (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, + (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; + return (__m128i) vld1q_s8(data); +} + +// Sets the 8 signed 16-bit integer values. +// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx +FORCE_INLINE __m128i _mm_set_epi16(short i7, + short i6, + short i5, + short i4, + short i3, + short i2, + short i1, + short i0) +{ + int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; + return vreinterpretq_m128i_s16(vld1q_s16(data)); +} + +// Sets the 16 signed 8-bit integer values in reverse order. 
+// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
+                                   signed char b1,
+                                   signed char b2,
+                                   signed char b3,
+                                   signed char b4,
+                                   signed char b5,
+                                   signed char b6,
+                                   signed char b7,
+                                   signed char b8,
+                                   signed char b9,
+                                   signed char b10,
+                                   signed char b11,
+                                   signed char b12,
+                                   signed char b13,
+                                   signed char b14,
+                                   signed char b15)
+{
+    int8_t ALIGN_STRUCT(16)
+        data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
+                    (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
+                    (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
+                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
+    return (__m128i) vld1q_s8(data);
+}
+
+// Sets the 4 signed 32-bit integer values to i.
+//
+// r0 := i
+// r1 := i
+// r2 := i
+// r3 := i
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_set1_epi32(int _i)
+{
+    return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
+}
+
+// Sets the 2 signed 64-bit integer values to i.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)
+FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
+{
+    return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
+}
+
+// Sets the 2 signed 64-bit integer values to i.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x
+FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
+{
+    return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
+}
+
+// Sets the 4 signed 32-bit integer values.
+// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
+{
+    int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
+    return vreinterpretq_m128i_s32(vld1q_s32(data));
+}
+
+// Returns the __m128i structure with its two 64-bit integer values
+// initialized to the values of the two 64-bit integers passed in.
+// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
+FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
+{
+    return vreinterpretq_m128i_s64(
+        vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
+}
+
+// Returns the __m128i structure with its two 64-bit integer values
+// initialized to the values of the two 64-bit integers passed in.
+// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
+FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
+{
+    return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
+}
+
+// Set packed double-precision (64-bit) floating-point elements in dst with the
+// supplied values.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd
+FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
+{
+    double ALIGN_STRUCT(16) data[2] = {e0, e1};
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
+#else
+    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
+#endif
+}
+
+// Set packed double-precision (64-bit) floating-point elements in dst with the
+// supplied values in reverse order.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_pd
+FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
+{
+    return _mm_set_pd(e0, e1);
+}
+
+// Copy double-precision (64-bit) floating-point element a to the lower element
+// of dst, and zero the upper element.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sd
+FORCE_INLINE __m128d _mm_set_sd(double a)
+{
+    return _mm_set_pd(0, a);
+}
+
+// Broadcast double-precision (64-bit) floating-point value a to all elements of
+// dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd1
+#define _mm_set_pd1 _mm_set1_pd
+
+// Stores four single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
+FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
+{
+    vst1q_f32(p, vreinterpretq_f32_m128(a));
+}
+
+// Store the lower single-precision (32-bit) floating-point element from a into
+// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+//
+// MEM[mem_addr+31:mem_addr] := a[31:0]
+// MEM[mem_addr+63:mem_addr+32] := a[31:0]
+// MEM[mem_addr+95:mem_addr+64] := a[31:0]
+// MEM[mem_addr+127:mem_addr+96] := a[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1
+FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
+{
+    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    vst1q_f32(p, vdupq_n_f32(a0));
+}
+
+// Store the lower single-precision (32-bit) floating-point element from a into
+// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+//
+// MEM[mem_addr+31:mem_addr] := a[31:0]
+// MEM[mem_addr+63:mem_addr+32] := a[31:0]
+// MEM[mem_addr+95:mem_addr+64] := a[31:0]
+// MEM[mem_addr+127:mem_addr+96] := a[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps
+#define _mm_store1_ps _mm_store_ps1
+
+// Store 4 single-precision (32-bit) floating-point elements from a into memory
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+//
+// MEM[mem_addr+31:mem_addr] := a[127:96]
+// MEM[mem_addr+63:mem_addr+32] := a[95:64]
+// MEM[mem_addr+95:mem_addr+64] := a[63:32]
+// MEM[mem_addr+127:mem_addr+96] := a[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps
+FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
+{
+    float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
+    float32x4_t rev = vextq_f32(tmp, tmp, 2);
+    vst1q_f32(p, rev);
+}
+
+// Stores four single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
+FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
+{
+    vst1q_f32(p, vreinterpretq_f32_m128(a));
+}
+
+// Stores four 32-bit integer values (as a __m128i value) at the address p.
+// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
+FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
+{
+    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
+}
+
+// Stores 128-bits of integer data a at the address p.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si128
+FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
+{
+    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
+}
+
+// Stores 64-bits of integer data a at the address p.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si64
+FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
+{
+    vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);
+}
+
+// Stores 32-bits of integer data a at the address p.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si32
+FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
+{
+    vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0);
+}
+
+// Stores 16-bits of integer data a at the address p.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si16
+FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)
+{
+    vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);
+}
+
+// Stores the lower single-precision, floating-point value.
+// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
+FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
+{
+    vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
+}
+
+// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
+// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
+// or a general-protection exception may be generated.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd
+FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+    vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
+#else
+    vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
+#endif
+}
+
+// Store the upper double-precision (64-bit) floating-point element from a into
+// memory.
+//
+// MEM[mem_addr+63:mem_addr] := a[127:64]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeh_pd
+FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+    vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
+#else
+    vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
+#endif
+}
+
+// Store the lower double-precision (64-bit) floating-point element from a into
+// memory.
+//
+// MEM[mem_addr+63:mem_addr] := a[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_pd
+FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+    vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
+#else
+    vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
+#endif
+}
+
+// Store 2 double-precision (64-bit) floating-point elements from a into memory
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+//
+// MEM[mem_addr+63:mem_addr] := a[127:64]
+// MEM[mem_addr+127:mem_addr+64] := a[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_pd
+FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
+{
+    float32x4_t f = vreinterpretq_f32_m128d(a);
+    _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2)));
+}
+
+// Store the lower double-precision (64-bit) floating-point element from a into
+// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1 +FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) + float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a)); + vst1q_f64((float64_t *) mem_addr, + vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low))); +#else + float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a)); + vst1q_f32((float32_t *) mem_addr, + vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low))); +#endif +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// memory. mem_addr does not need to be aligned on any particular boundary. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_store_sd +FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) + vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a))); +#else + vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a))); +#endif +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=9,526,5601&text=_mm_store1_pd +#define _mm_store1_pd _mm_store_pd1 + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory. mem_addr does not need to be aligned on any +// particular boundary. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd +FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a) +{ + _mm_store_pd(mem_addr, a); +} + +// Reads the lower 64 bits of b and stores them into the lower 64 bits of a. +// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx +FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b) +{ + uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a)); + uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b)); + *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi)); +} + +// Stores the lower two single-precision floating point values of a to the +// address p. +// +// *p0 := a0 +// *p1 := a1 +// +// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx +FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a) +{ + *p = vreinterpret_m64_f32(vget_low_f32(a)); +} + +// Stores the upper two single-precision, floating-point values of a to the +// address p. +// +// *p0 := a2 +// *p1 := a3 +// +// https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx +FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a) +{ + *p = vreinterpret_m64_f32(vget_high_f32(a)); +} + +// Loads a single single-precision, floating-point value, copying it into all +// four words +// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx +FORCE_INLINE __m128 _mm_load1_ps(const float *p) +{ + return vreinterpretq_m128_f32(vld1q_dup_f32(p)); +} + +// Load a single-precision (32-bit) floating-point element from memory into all +// elements of dst. 
+// +// dst[31:0] := MEM[mem_addr+31:mem_addr] +// dst[63:32] := MEM[mem_addr+31:mem_addr] +// dst[95:64] := MEM[mem_addr+31:mem_addr] +// dst[127:96] := MEM[mem_addr+31:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1 +#define _mm_load_ps1 _mm_load1_ps + +// Sets the lower two single-precision, floating-point values with 64 +// bits of data loaded from the address p; the upper two values are passed +// through from a. +// +// Return Value +// r0 := *p0 +// r1 := *p1 +// r2 := a2 +// r3 := a3 +// +// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx +FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a))); +} + +// Load 4 single-precision (32-bit) floating-point elements from memory into dst +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// +// dst[31:0] := MEM[mem_addr+127:mem_addr+96] +// dst[63:32] := MEM[mem_addr+95:mem_addr+64] +// dst[95:64] := MEM[mem_addr+63:mem_addr+32] +// dst[127:96] := MEM[mem_addr+31:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps +FORCE_INLINE __m128 _mm_loadr_ps(const float *p) +{ + float32x4_t v = vrev64q_f32(vld1q_f32(p)); + return vreinterpretq_m128_f32(vextq_f32(v, v, 2)); +} + +// Sets the upper two single-precision, floating-point values with 64 +// bits of data loaded from the address p; the lower two values are passed +// through from a. +// +// r0 := a0 +// r1 := a1 +// r2 := *p0 +// r3 := *p1 +// +// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx +FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p))); +} + +// Loads four single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx +FORCE_INLINE __m128 _mm_load_ps(const float *p) +{ + return vreinterpretq_m128_f32(vld1q_f32(p)); +} + +// Loads four single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_loadu_ps(const float *p) +{ + // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are + // equivalent for neon + return vreinterpretq_m128_f32(vld1q_f32(p)); +} + +// Load unaligned 16-bit integer from memory into the first element of dst. +// +// dst[15:0] := MEM[mem_addr+15:mem_addr] +// dst[MAX:16] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16 +FORCE_INLINE __m128i _mm_loadu_si16(const void *p) +{ + return vreinterpretq_m128i_s16( + vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0)); +} + +// Load unaligned 64-bit integer from memory into the first element of dst. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[MAX:64] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64 +FORCE_INLINE __m128i _mm_loadu_si64(const void *p) +{ + return vreinterpretq_m128i_s64( + vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0))); +} + +// Load a double-precision (64-bit) floating-point element from memory into the +// lower of dst, and zero the upper element. mem_addr does not need to be +// aligned on any particular boundary. 
+//
+// dst[63:0] := MEM[mem_addr+63:mem_addr]
+// dst[127:64] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd
+FORCE_INLINE __m128d _mm_load_sd(const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
+#else
+    const float *fp = (const float *) p;
+    float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
+    return vreinterpretq_m128d_f32(vld1q_f32(data));
+#endif
+}
+
+// Loads two double-precision, floating-point values from 16-byte aligned
+// memory.
+//
+// dst[127:0] := MEM[mem_addr+127:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd
+FORCE_INLINE __m128d _mm_load_pd(const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vld1q_f64(p));
+#else
+    const float *fp = (const float *) p;
+    float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
+    return vreinterpretq_m128d_f32(vld1q_f32(data));
+#endif
+}
+
+// Loads two double-precision, floating-point values from unaligned memory.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd
+FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
+{
+    return _mm_load_pd(p);
+}
+
+// Loads a single-precision, floating-point value into the low word and
+// clears the upper three words.
+// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
+FORCE_INLINE __m128 _mm_load_ss(const float *p)
+{
+    return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
+}
+
+// Load 64-bit integer from memory into the first element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_epi64
+FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
+{
+    /* Load the lower 64 bits of the value pointed to by p into the
+     * lower 64 bits of the result, zeroing the upper 64 bits of the result.
+     */
+    return vreinterpretq_m128i_s32(
+        vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into the
+// lower element of dst, and copy the upper element from a to dst. mem_addr does
+// not need to be aligned on any particular boundary.
+//
+// dst[63:0] := MEM[mem_addr+63:mem_addr]
+// dst[127:64] := a[127:64]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd
+FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
+#else
+    return vreinterpretq_m128d_f32(
+        vcombine_f32(vld1_f32((const float *) p),
+                     vget_high_f32(vreinterpretq_f32_m128d(a))));
+#endif
+}
+
+// Load 2 double-precision (64-bit) floating-point elements from memory into dst
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+// +// dst[63:0] := MEM[mem_addr+127:mem_addr+64] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd +FORCE_INLINE __m128d _mm_loadr_pd(const double *p) +{ +#if defined(__aarch64__) + float64x2_t v = vld1q_f64(p); + return vreinterpretq_m128d_f64(vextq_f64(v, v, 1)); +#else + int64x2_t v = vld1q_s64((const int64_t *) p); + return vreinterpretq_m128d_s64(vextq_s64(v, v, 1)); +#endif +} + +// Sets the low word to the single-precision, floating-point value of b +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100) +FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0), + vreinterpretq_f32_m128(a), 0)); +} + +// Move the lower double-precision (64-bit) floating-point element from b to the +// lower element of dst, and copy the upper element from a to the upper element +// of dst. +// +// dst[63:0] := b[63:0] +// dst[127:64] := a[127:64] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd +FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_f32( + vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)), + vget_high_f32(vreinterpretq_f32_m128d(a)))); +} + +// Copy the lower 64-bit integer in a to the lower element of dst, and zero the +// upper element. +// +// dst[63:0] := a[63:0] +// dst[127:64] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64 +FORCE_INLINE __m128i _mm_move_epi64(__m128i a) +{ + return vreinterpretq_m128i_s64( + vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1)); +} + +// Return vector of type __m128 with undefined elements. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps +FORCE_INLINE __m128 _mm_undefined_ps(void) +{ +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif + __m128 a; + return a; +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} + +/* Logic/Binary operations */ + +// Computes the bitwise AND-NOT of the four single-precision, floating-point +// values of a and b. +// +// r0 := ~a0 & b0 +// r1 := ~a1 & b1 +// r2 := ~a2 & b2 +// r3 := ~a3 & b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx +FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vbicq_s32(vreinterpretq_s32_m128(b), + vreinterpretq_s32_m128(a))); // *NOTE* argument swap +} + +// Compute the bitwise NOT of packed double-precision (64-bit) floating-point +// elements in a and then AND with b, and store the results in dst. +// +// FOR j := 0 to 1 +// i := j*64 +// dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd +FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b) +{ + // *NOTE* argument swap + return vreinterpretq_m128d_s64( + vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a))); +} + +// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the +// 128-bit value in a. 
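+//
+// A common masking idiom (illustrative, not part of the upstream comment):
+//
+//   __m128i kept = _mm_andnot_si128(mask, data); // kept = (~mask) & data
+//
+// i.e. it clears exactly the bits of data that are set in mask.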
+// +// r := (~a) & b +// +// https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx +FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vbicq_s32(vreinterpretq_s32_m128i(b), + vreinterpretq_s32_m128i(a))); // *NOTE* argument swap +} + +// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in +// b. +// +// r := a & b +// +// https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx +FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Computes the bitwise AND of the four single-precision, floating-point values +// of a and b. +// +// r0 := a0 & b0 +// r1 := a1 & b1 +// r2 := a2 & b2 +// r3 := a3 & b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx +FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +// Compute the bitwise AND of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// +// FOR j := 0 to 1 +// i := j*64 +// dst[i+63:i] := a[i+63:i] AND b[i+63:i] +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd +FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Computes the bitwise OR of the four single-precision, floating-point values +// of a and b. +// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx +FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +// Computes bitwise EXOR (exclusive-or) of the four single-precision, +// floating-point values of a and b. +// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx +FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +// Compute the bitwise XOR of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// +// FOR j := 0 to 1 +// i := j*64 +// dst[i+63:i] := a[i+63:i] XOR b[i+63:i] +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd +FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Compute the bitwise OR of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_or_pd +FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b. +// +// r := a | b +// +// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx +FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in +// b. 
https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx +FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Duplicate the low double-precision (64-bit) floating-point element from a, +// and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movedup_pd +FORCE_INLINE __m128d _mm_movedup_pd(__m128d a) +{ +#if (__aarch64__) + return vreinterpretq_m128d_f64( + vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0)); +#else + return vreinterpretq_m128d_u64( + vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0))); +#endif +} + +// Duplicate odd-indexed single-precision (32-bit) floating-point elements +// from a, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps +FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a) +{ +#if __has_builtin(__builtin_shufflevector) + return vreinterpretq_m128_f32(__builtin_shufflevector( + vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3)); +#else + float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); + float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3); + float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +#endif +} + +// Duplicate even-indexed single-precision (32-bit) floating-point elements +// from a, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps +FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a) +{ +#if __has_builtin(__builtin_shufflevector) + return vreinterpretq_m128_f32(__builtin_shufflevector( + vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2)); +#else + float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2); + float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +#endif +} + +// Moves the upper two values of B into the lower two values of A. +// +// r3 := a3 +// r2 := a2 +// r1 := b3 +// r0 := b2 +FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B) +{ + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A)); + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B)); + return vreinterpretq_m128_f32(vcombine_f32(b32, a32)); +} + +// Moves the lower two values of B into the upper two values of A. +// +// r3 := b1 +// r2 := b0 +// r1 := a1 +// r0 := a0 +FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); +} + +// Create mask from the most significant bit of each 8-bit element in a, and +// store the result in dst. 
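+// Illustrative example (hypothetical values), assuming the usual
+// _mm_set_pi8(b7, ..., b0) element ordering:
+//
+//   __m64 v = _mm_set_pi8(-1, 0, -1, 0, 0, 0, 0, -1);
+//   int m = _mm_movemask_pi8(v); // m == 0xA1 (sign bits of elements 7, 5, 0)
+//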
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pi8 +FORCE_INLINE int _mm_movemask_pi8(__m64 a) +{ + uint8x8_t input = vreinterpret_u8_m64(a); +#if defined(__aarch64__) + static const int8x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7}; + uint8x8_t tmp = vshr_n_u8(input, 7); + return vaddv_u8(vshl_u8(tmp, shift)); +#else + // Refer the implementation of `_mm_movemask_epi8` + uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7)); + uint32x2_t paired16 = + vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7)); + uint8x8_t paired32 = + vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14)); + return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4); +#endif +} + +// Compute the absolute value of packed signed 32-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 3 +// i := j*32 +// dst[i+31:i] := ABS(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32 +FORCE_INLINE __m128i _mm_abs_epi32(__m128i a) +{ + return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a))); +} + +// Compute the absolute value of packed signed 16-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 7 +// i := j*16 +// dst[i+15:i] := ABS(a[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16 +FORCE_INLINE __m128i _mm_abs_epi16(__m128i a) +{ + return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a))); +} + +// Compute the absolute value of packed signed 8-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 15 +// i := j*8 +// dst[i+7:i] := ABS(a[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8 +FORCE_INLINE __m128i _mm_abs_epi8(__m128i a) +{ + return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a))); +} + +// Compute the absolute value of packed signed 32-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 1 +// i := j*32 +// dst[i+31:i] := ABS(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32 +FORCE_INLINE __m64 _mm_abs_pi32(__m64 a) +{ + return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a))); +} + +// Compute the absolute value of packed signed 16-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := ABS(a[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16 +FORCE_INLINE __m64 _mm_abs_pi16(__m64 a) +{ + return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a))); +} + +// Compute the absolute value of packed signed 8-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := ABS(a[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8 +FORCE_INLINE __m64 _mm_abs_pi8(__m64 a) +{ + return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a))); +} + +// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift +// the result right by imm8 bytes, and store the low 16 bytes in dst. 
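+// Worked example (illustrative): with imm8 == 4 the result is bytes 4..15 of
+// b followed by bytes 0..3 of a:
+//
+//   __m128i r = _mm_alignr_epi8(a, b, 4);
+//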
+// +// tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8) +// dst[127:0] := tmp[127:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi8 +#define _mm_alignr_epi8(a, b, imm) \ + __extension__({ \ + __m128i ret; \ + if (unlikely((imm) >= 32)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + uint8x16_t tmp_low, tmp_high; \ + if (imm >= 16) { \ + const int idx = imm - 16; \ + tmp_low = vreinterpretq_u8_m128i(a); \ + tmp_high = vdupq_n_u8(0); \ + ret = \ + vreinterpretq_m128i_u8(vextq_u8(tmp_low, tmp_high, idx)); \ + } else { \ + const int idx = imm; \ + tmp_low = vreinterpretq_u8_m128i(b); \ + tmp_high = vreinterpretq_u8_m128i(a); \ + ret = \ + vreinterpretq_m128i_u8(vextq_u8(tmp_low, tmp_high, idx)); \ + } \ + } \ + ret; \ + }) + +// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift +// the result right by imm8 bytes, and store the low 8 bytes in dst. +// +// tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8) +// dst[63:0] := tmp[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_pi8 +#define _mm_alignr_pi8(a, b, imm) \ + __extension__({ \ + __m64 ret; \ + if (unlikely((imm) >= 16)) { \ + ret = vreinterpret_m64_s8(vdup_n_s8(0)); \ + } else { \ + uint8x8_t tmp_low, tmp_high; \ + if (imm >= 8) { \ + const int idx = imm - 8; \ + tmp_low = vreinterpret_u8_m64(a); \ + tmp_high = vdup_n_u8(0); \ + ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ + } else { \ + const int idx = imm; \ + tmp_low = vreinterpret_u8_m64(b); \ + tmp_high = vreinterpret_u8_m64(a); \ + ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ + } \ + } \ + ret; \ + }) + +// Takes the upper 64 bits of a and places it in the low end of the result +// Takes the lower 64 bits of b and places it into the high end of the result. +FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b) +{ + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a32, b10)); +} + +// takes the lower two 32-bit values from a and swaps them and places in high +// end of result takes the higher two 32 bit values from b and swaps them and +// places in low end of result. 
+FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b))); + return vreinterpretq_m128_f32(vcombine_f32(a01, b23)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b) +{ + float32x2_t a21 = vget_high_f32( + vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); + float32x2_t b03 = vget_low_f32( + vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); + return vreinterpretq_m128_f32(vcombine_f32(a21, b03)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b) +{ + float32x2_t a03 = vget_low_f32( + vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); + float32x2_t b21 = vget_high_f32( + vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); + return vreinterpretq_m128_f32(vcombine_f32(a03, b21)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a01, b10)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b))); + return vreinterpretq_m128_f32(vcombine_f32(a01, b01)); +} + +// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the +// high +FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b32)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b) +{ + float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a11, b00)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b) +{ + float32x2_t a22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a22, b00)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b) +{ + float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t b22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a00, b22)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b) +{ + float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + float32x2_t a22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/ + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a02, b32)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b) +{ + float32x2_t a33 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1); + float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1); + 
return vreinterpretq_m128_f32(vcombine_f32(a33, b11)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a10, b20)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32_t b2 = vgetq_lane_f32(b, 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a01, b20)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b) +{ + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32_t b2 = vgetq_lane_f32(b, 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a32, b20)); +} + +// NEON does not support a general purpose permute intrinsic +// Selects four specific single-precision, floating-point values from a and b, +// based on the mask i. +// +// C equivalent: +// __m128 _mm_shuffle_ps_default(__m128 a, __m128 b, +// __constrange(0, 255) int imm) { +// __m128 ret; +// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; +// ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03]; +// return ret; +// } +// +// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx +#define _mm_shuffle_ps_default(a, b, imm) \ + __extension__({ \ + float32x4_t ret; \ + ret = vmovq_n_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))); \ + ret = vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \ + ret, 1); \ + ret = vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \ + ret, 2); \ + ret = vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \ + ret, 3); \ + vreinterpretq_m128_f32(ret); \ + }) + +// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255) +// int imm) +#if __has_builtin(__builtin_shufflevector) +#define _mm_shuffle_ps(a, b, imm) \ + __extension__({ \ + float32x4_t _input1 = vreinterpretq_f32_m128(a); \ + float32x4_t _input2 = vreinterpretq_f32_m128(b); \ + float32x4_t _shuf = __builtin_shufflevector( \ + _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \ + (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \ + vreinterpretq_m128_f32(_shuf); \ + }) +#else // generic +#define _mm_shuffle_ps(a, b, imm) \ + __extension__({ \ + __m128 ret; \ + switch (imm) { \ + case _MM_SHUFFLE(1, 0, 3, 2): \ + ret = _mm_shuffle_ps_1032((a), (b)); \ + break; \ + case _MM_SHUFFLE(2, 3, 0, 1): \ + ret = _mm_shuffle_ps_2301((a), (b)); \ + break; \ + case _MM_SHUFFLE(0, 3, 2, 1): \ + ret = _mm_shuffle_ps_0321((a), (b)); \ + break; \ + case _MM_SHUFFLE(2, 1, 0, 3): \ + ret = _mm_shuffle_ps_2103((a), (b)); \ + break; \ + case _MM_SHUFFLE(1, 0, 1, 0): \ + ret = _mm_movelh_ps((a), (b)); \ + break; \ + case _MM_SHUFFLE(1, 0, 0, 1): \ + ret = _mm_shuffle_ps_1001((a), (b)); \ + break; \ + case _MM_SHUFFLE(0, 1, 0, 1): \ + ret = _mm_shuffle_ps_0101((a), (b)); \ + break; \ + case _MM_SHUFFLE(3, 2, 1, 0): \ + ret = _mm_shuffle_ps_3210((a), (b)); \ + break; \ + case _MM_SHUFFLE(0, 0, 1, 1): \ + ret = 
_mm_shuffle_ps_0011((a), (b));                 \
+            break;                                         \
+        case _MM_SHUFFLE(0, 0, 2, 2):                      \
+            ret = _mm_shuffle_ps_0022((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 2, 0, 0):                      \
+            ret = _mm_shuffle_ps_2200((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(3, 2, 0, 2):                      \
+            ret = _mm_shuffle_ps_3202((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(3, 2, 3, 2):                      \
+            ret = _mm_movehl_ps((b), (a));                 \
+            break;                                         \
+        case _MM_SHUFFLE(1, 1, 3, 3):                      \
+            ret = _mm_shuffle_ps_1133((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 0, 1, 0):                      \
+            ret = _mm_shuffle_ps_2010((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 0, 0, 1):                      \
+            ret = _mm_shuffle_ps_2001((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 0, 3, 2):                      \
+            ret = _mm_shuffle_ps_2032((a), (b));           \
+            break;                                         \
+        default:                                           \
+            ret = _mm_shuffle_ps_default((a), (b), (imm)); \
+            break;                                         \
+        }                                                  \
+        ret;                                               \
+    })
+#endif
+
+// Takes the upper 64 bits of a and places it in the low end of the result.
+// Takes the lower 64 bits of a and places it into the high end of the result.
+FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
+{
+    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
+}
+
+// Takes the lower two 32-bit values from a, swaps them and places them in the
+// low end of the result; takes the higher two 32-bit values from a, swaps
+// them and places them in the high end of the result.
+FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
+{
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
+}
+
+// Rotates the least significant 32 bits into the most significant 32 bits,
+// and shifts the rest down.
+FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
+{
+    return vreinterpretq_m128i_s32(
+        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
+}
+
+// Rotates the most significant 32 bits into the least significant 32 bits,
+// and shifts the rest up.
+FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
+{
+    return vreinterpretq_m128i_s32(
+        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
+}
+
+// Gets the lower 64 bits of a, and places it in the upper 64 bits.
+// Gets the lower 64 bits of a and places it in the lower 64 bits.
+FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
+{
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
+}
+
+// Gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in
+// the lower 64 bits. Gets the lower 64 bits of a, and places it in the upper
+// 64 bits.
+FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
+{
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
+}
+
+// Gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in
+// the upper 64 bits. Gets the lower 64 bits of a, swaps the 0 and 1 elements,
+// and places it in the lower 64 bits.
+FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
+{
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
+}
+
+FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
+{
+    int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
+    int32x2_t a22 =
vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); + return vreinterpretq_m128i_s32(vcombine_s32(a11, a22)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a) +{ + int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a22, a01)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a) +{ + int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1); + return vreinterpretq_m128i_s32(vcombine_s32(a32, a33)); +} + +// Shuffle packed 8-bit integers in a according to shuffle control mask in the +// corresponding 8-bit element of b, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8 +FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b) +{ + int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a + uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b + uint8x16_t idx_masked = + vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits +#if defined(__aarch64__) + return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked)); +#elif defined(__GNUC__) + int8x16_t ret; + // %e and %f represent the even and odd D registers + // respectively. + __asm__ __volatile__( + "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n" + "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n" + : [ret] "=&w"(ret) + : [tbl] "w"(tbl), [idx] "w"(idx_masked)); + return vreinterpretq_m128i_s8(ret); +#else + // use this line if testing on aarch64 + int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)}; + return vreinterpretq_m128i_s8( + vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)), + vtbl2_s8(a_split, vget_high_u8(idx_masked)))); +#endif +} + +// C equivalent: +// __m128i _mm_shuffle_epi32_default(__m128i a, +// __constrange(0, 255) int imm) { +// __m128i ret; +// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; +// ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03]; +// return ret; +// } +#define _mm_shuffle_epi32_default(a, imm) \ + __extension__({ \ + int32x4_t ret; \ + ret = vmovq_n_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3))); \ + ret = vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \ + ret, 1); \ + ret = vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \ + ret, 2); \ + ret = vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \ + ret, 3); \ + vreinterpretq_m128i_s32(ret); \ + }) + +// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255) +// int imm) +#if defined(__aarch64__) +#define _mm_shuffle_epi32_splat(a, imm) \ + __extension__({ \ + vreinterpretq_m128i_s32( \ + vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \ + }) +#else +#define _mm_shuffle_epi32_splat(a, imm) \ + __extension__({ \ + vreinterpretq_m128i_s32( \ + vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \ + }) +#endif + +// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm. 
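+// Illustrative usage (hypothetical values):
+//
+//   __m128i v = _mm_set_epi32(3, 2, 1, 0); // elements {0, 1, 2, 3}
+//   __m128i r = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3));
+//   // r holds {3, 2, 1, 0}: element i of r is element ((imm >> 2*i) & 0x3)
+//   // of v, so this particular mask reverses the element order.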
+
+// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.
+// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
+// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
+//                                        __constrange(0,255) int imm)
+#if __has_builtin(__builtin_shufflevector)
+#define _mm_shuffle_epi32(a, imm) \
+    __extension__({ \
+        int32x4_t _input = vreinterpretq_s32_m128i(a); \
+        int32x4_t _shuf = __builtin_shufflevector( \
+            _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
+            ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \
+        vreinterpretq_m128i_s32(_shuf); \
+    })
+#else  // generic
+#define _mm_shuffle_epi32(a, imm) \
+    __extension__({ \
+        __m128i ret; \
+        switch (imm) { \
+        case _MM_SHUFFLE(1, 0, 3, 2): \
+            ret = _mm_shuffle_epi_1032((a)); \
+            break; \
+        case _MM_SHUFFLE(2, 3, 0, 1): \
+            ret = _mm_shuffle_epi_2301((a)); \
+            break; \
+        case _MM_SHUFFLE(0, 3, 2, 1): \
+            ret = _mm_shuffle_epi_0321((a)); \
+            break; \
+        case _MM_SHUFFLE(2, 1, 0, 3): \
+            ret = _mm_shuffle_epi_2103((a)); \
+            break; \
+        case _MM_SHUFFLE(1, 0, 1, 0): \
+            ret = _mm_shuffle_epi_1010((a)); \
+            break; \
+        case _MM_SHUFFLE(1, 0, 0, 1): \
+            ret = _mm_shuffle_epi_1001((a)); \
+            break; \
+        case _MM_SHUFFLE(0, 1, 0, 1): \
+            ret = _mm_shuffle_epi_0101((a)); \
+            break; \
+        case _MM_SHUFFLE(2, 2, 1, 1): \
+            ret = _mm_shuffle_epi_2211((a)); \
+            break; \
+        case _MM_SHUFFLE(0, 1, 2, 2): \
+            ret = _mm_shuffle_epi_0122((a)); \
+            break; \
+        case _MM_SHUFFLE(3, 3, 3, 2): \
+            ret = _mm_shuffle_epi_3332((a)); \
+            break; \
+        case _MM_SHUFFLE(0, 0, 0, 0): \
+            ret = _mm_shuffle_epi32_splat((a), 0); \
+            break; \
+        case _MM_SHUFFLE(1, 1, 1, 1): \
+            ret = _mm_shuffle_epi32_splat((a), 1); \
+            break; \
+        case _MM_SHUFFLE(2, 2, 2, 2): \
+            ret = _mm_shuffle_epi32_splat((a), 2); \
+            break; \
+        case _MM_SHUFFLE(3, 3, 3, 3): \
+            ret = _mm_shuffle_epi32_splat((a), 3); \
+            break; \
+        default: \
+            ret = _mm_shuffle_epi32_default((a), (imm)); \
+            break; \
+        } \
+        ret; \
+    })
+#endif
+
+// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
+// by imm.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
+// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
+//                                                   __constrange(0,255) int
+//                                                   imm)
+#define _mm_shufflelo_epi16_function(a, imm) \
+    __extension__({ \
+        int16x8_t ret = vreinterpretq_s16_m128i(a); \
+        int16x4_t lowBits = vget_low_s16(ret); \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
+                             1); \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
+                             2); \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
+                             3); \
+        vreinterpretq_m128i_s16(ret); \
+    })
+
+// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
+//                                          __constrange(0,255) int imm)
+#if __has_builtin(__builtin_shufflevector)
+#define _mm_shufflelo_epi16(a, imm) \
+    __extension__({ \
+        int16x8_t _input = vreinterpretq_s16_m128i(a); \
+        int16x8_t _shuf = __builtin_shufflevector( \
+            _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \
+            (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
+        vreinterpretq_m128i_s16(_shuf); \
+    })
+#else  // generic
+#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
+#endif
+
+// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
+// by imm.
+// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx +// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a, +// __constrange(0,255) int +// imm) +#define _mm_shufflehi_epi16_function(a, imm) \ + __extension__({ \ + int16x8_t ret = vreinterpretq_s16_m128i(a); \ + int16x4_t highBits = vget_high_s16(ret); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \ + 5); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \ + 6); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \ + 7); \ + vreinterpretq_m128i_s16(ret); \ + }) + +// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a, +// __constrange(0,255) int imm) +#if __has_builtin(__builtin_shufflevector) +#define _mm_shufflehi_epi16(a, imm) \ + __extension__({ \ + int16x8_t _input = vreinterpretq_s16_m128i(a); \ + int16x8_t _shuf = __builtin_shufflevector( \ + _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \ + (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \ + (((imm) >> 6) & 0x3) + 4); \ + vreinterpretq_m128i_s16(_shuf); \ + }) +#else // generic +#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm)) +#endif + +// Shuffle double-precision (64-bit) floating-point elements using the control +// in imm8, and store the results in dst. +// +// dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] +// dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pd +#if __has_builtin(__builtin_shufflevector) +#define _mm_shuffle_pd(a, b, imm8) \ + vreinterpretq_m128d_s64(__builtin_shufflevector( \ + vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), imm8 & 0x1, \ + ((imm8 & 0x2) >> 1) + 2)) +#else +#define _mm_shuffle_pd(a, b, imm8) \ + _mm_castsi128_pd(_mm_set_epi64x( \ + vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \ + vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1))) +#endif + +// Blend packed 16-bit integers from a and b using control mask imm8, and store +// the results in dst. +// +// FOR j := 0 to 7 +// i := j*16 +// IF imm8[j] +// dst[i+15:i] := b[i+15:i] +// ELSE +// dst[i+15:i] := a[i+15:i] +// FI +// ENDFOR +// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b, +// __constrange(0,255) int imm) +#define _mm_blend_epi16(a, b, imm) \ + __extension__({ \ + const uint16_t _mask[8] = {((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0}; \ + uint16x8_t _mask_vec = vld1q_u16(_mask); \ + uint16x8_t _a = vreinterpretq_u16_m128i(a); \ + uint16x8_t _b = vreinterpretq_u16_m128i(b); \ + vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \ + }) + +// Blend packed double-precision (64-bit) floating-point elements from a and b +// using control mask imm8, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_pd +#define _mm_blend_pd(a, b, imm) \ + __extension__({ \ + const uint64_t _mask[2] = { \ + ((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \ + ((imm) & (1 << 1)) ? 
~UINT64_C(0) : UINT64_C(0)}; \
+        uint64x2_t _mask_vec = vld1q_u64(_mask); \
+        uint64x2_t _a = vreinterpretq_u64_m128d(a); \
+        uint64x2_t _b = vreinterpretq_u64_m128d(b); \
+        vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, _b, _a)); \
+    })
+
+// Blend packed 8-bit integers from a and b using mask, and store the results
+// in dst.
+//
+// FOR j := 0 to 15
+// i := j*8
+// IF mask[i+7]
+// dst[i+7:i] := b[i+7:i]
+// ELSE
+// dst[i+7:i] := a[i+7:i]
+// FI
+// ENDFOR
+FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
+{
+    // Use a signed shift right to create a mask with the sign bit
+    uint8x16_t mask =
+        vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
+    uint8x16_t a = vreinterpretq_u8_m128i(_a);
+    uint8x16_t b = vreinterpretq_u8_m128i(_b);
+    return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
+}
+
+/* Shifts */
+
+// Shift packed 16-bit integers in a right by imm while shifting in sign
+// bits, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16
+FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
+{
+    const int count = (imm & ~15) ? 15 : imm;
+    return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
+}
+
+// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits
+// while shifting in zeros.
+//
+// r0 := a0 << count
+// r1 := a1 << count
+// ...
+// r7 := a7 << count
+//
+// https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx
+#define _mm_slli_epi16(a, imm) \
+    __extension__({ \
+        __m128i ret; \
+        if (unlikely((imm) <= 0)) { \
+            ret = a; \
+        } else if (unlikely((imm) > 15)) { \
+            ret = _mm_setzero_si128(); \
+        } else { \
+            ret = vreinterpretq_m128i_s16( \
+                vshlq_n_s16(vreinterpretq_s16_m128i(a), (imm))); \
+        } \
+        ret; \
+    })
+
+// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits
+// while shifting in zeros.
+// https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx
+// FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm)
+FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
+{
+    if (unlikely(imm <= 0)) /* TODO: add constant range macro: [0, 255] */
+        return a;
+    if (unlikely(imm > 31))
+        return _mm_setzero_si128();
+    return vreinterpretq_m128i_s32(
+        vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
+}
+
+// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
+// store the results in dst.
+FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
+{
+    if (unlikely(imm <= 0)) /* TODO: add constant range macro: [0, 255] */
+        return a;
+    if (unlikely(imm > 63))
+        return _mm_setzero_si128();
+    return vreinterpretq_m128i_s64(
+        vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
+}
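+
+// Usage sketch (illustrative addition, not part of the x86 API): for the
+// macro-based shifts here, imm must be a compile-time constant, since they
+// expand to NEON intrinsics with immediate operands. A hypothetical helper
+// multiplying all eight 16-bit lanes by 8 via a constant left shift:
+FORCE_INLINE __m128i sse2neon_example_mul8_epi16(__m128i v)
+{
+    // imm is the literal 3, so the macro selects the vshlq_n_s16 branch.
+    return _mm_slli_epi16(v, 3);
+}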
+
+// Shift packed 16-bit integers in a right by imm8 while shifting in zeros,
+// and store the results in dst.
+//
+// FOR j := 0 to 7
+// i := j*16
+// IF imm8[7:0] > 15
+// dst[i+15:i] := 0
+// ELSE
+// dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0])
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16
+#define _mm_srli_epi16(a, imm) \
+    __extension__({ \
+        __m128i ret; \
+        if (unlikely((imm) == 0)) { \
+            ret = a; \
+        } else if (likely(0 < (imm) && (imm) < 16)) { \
+            ret = vreinterpretq_m128i_u16( \
+                vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-imm))); \
+        } else { \
+            ret = _mm_setzero_si128(); \
+        } \
+        ret; \
+    })
+
+// Shift packed 32-bit integers in a right by imm8 while shifting in zeros,
+// and store the results in dst.
+//
+// FOR j := 0 to 3
+// i := j*32
+// IF imm8[7:0] > 31
+// dst[i+31:i] := 0
+// ELSE
+// dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0])
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32
+// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
+#define _mm_srli_epi32(a, imm) \
+    __extension__({ \
+        __m128i ret; \
+        if (unlikely((imm) == 0)) { \
+            ret = a; \
+        } else if (likely(0 < (imm) && (imm) < 32)) { \
+            ret = vreinterpretq_m128i_u32( \
+                vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-imm))); \
+        } else { \
+            ret = _mm_setzero_si128(); \
+        } \
+        ret; \
+    })
+
+// Shift packed 64-bit integers in a right by imm8 while shifting in zeros,
+// and store the results in dst.
+//
+// FOR j := 0 to 1
+// i := j*64
+// IF imm8[7:0] > 63
+// dst[i+63:i] := 0
+// ELSE
+// dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0])
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64
+#define _mm_srli_epi64(a, imm) \
+    __extension__({ \
+        __m128i ret; \
+        if (unlikely((imm) == 0)) { \
+            ret = a; \
+        } else if (likely(0 < (imm) && (imm) < 64)) { \
+            ret = vreinterpretq_m128i_u64( \
+                vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-imm))); \
+        } else { \
+            ret = _mm_setzero_si128(); \
+        } \
+        ret; \
+    })
+
+// Shift packed 32-bit integers in a right by imm8 while shifting in sign
+// bits, and store the results in dst.
+//
+// FOR j := 0 to 3
+// i := j*32
+// IF imm8[7:0] > 31
+// dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
+// ELSE
+// dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0])
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32
+// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
+#define _mm_srai_epi32(a, imm) \
+    __extension__({ \
+        __m128i ret; \
+        if (unlikely((imm) == 0)) { \
+            ret = a; \
+        } else if (likely(0 < (imm) && (imm) < 32)) { \
+            ret = vreinterpretq_m128i_s32( \
+                vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-imm))); \
+        } else { \
+            ret = vreinterpretq_m128i_s32( \
+                vshrq_n_s32(vreinterpretq_s32_m128i(a), 31)); \
+        } \
+        ret; \
+    })
+
+// Shifts the 128-bit value in a right by imm bytes while shifting in zeros.
+// imm must be an immediate.
+//
+// r := srl(a, imm*8)
+//
+// https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx
+// FORCE_INLINE _mm_srli_si128(__m128i a, __constrange(0,255) int imm)
+#define _mm_srli_si128(a, imm) \
+    __extension__({ \
+        __m128i ret; \
+        if (unlikely((imm) <= 0)) { \
+            ret = a; \
+        } else if (unlikely((imm) > 15)) { \
+            ret = _mm_setzero_si128(); \
+        } else { \
+            ret = vreinterpretq_m128i_s8( \
+                vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), (imm))); \
+        } \
+        ret; \
+    })
+
+// Shifts the 128-bit value in a left by imm bytes while shifting in zeros.
+// imm must be an immediate.
+//
+// r := a << (imm * 8)
+//
+// https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx
+// FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm)
+#define _mm_slli_si128(a, imm) \
+    __extension__({ \
+        __m128i ret; \
+        if (unlikely((imm) <= 0)) { \
+            ret = a; \
+        } else if (unlikely((imm) > 15)) { \
+            ret = _mm_setzero_si128(); \
+        } else { \
+            ret = vreinterpretq_m128i_s8(vextq_s8( \
+                vdupq_n_s8(0), vreinterpretq_s8_m128i(a), 16 - (imm))); \
+        } \
+        ret; \
+    })
+
+// Compute the square root of packed double-precision (64-bit) floating-point
+// elements in a, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_pd
+FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
+#else
+    double a0 = sqrt(((double *) &a)[0]);
+    double a1 = sqrt(((double *) &a)[1]);
+    return _mm_set_pd(a1, a0);
+#endif
+}
+
+// Compute the square root of the lower double-precision (64-bit)
+// floating-point element in b, store the result in the lower element of dst,
+// and copy the upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sd
+FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_sqrt_pd(b));
+#else
+    return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0]));
+#endif
+}
+
+// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits
+// while shifting in zeros.
+//
+// r0 := a0 << count
+// r1 := a1 << count
+// ...
+// r7 := a7 << count
+//
+// https://msdn.microsoft.com/en-us/library/c79w388h(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (unlikely(c > 15))
+        return _mm_setzero_si128();
+
+    int16x8_t vc = vdupq_n_s16((int16_t) c);
+    return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
+}
+
+// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits
+// while shifting in zeros.
+//
+// r0 := a0 << count
+// r1 := a1 << count
+// r2 := a2 << count
+// r3 := a3 << count
+//
+// https://msdn.microsoft.com/en-us/library/6fe5a6s9(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (unlikely(c > 31))
+        return _mm_setzero_si128();
+
+    int32x4_t vc = vdupq_n_s32((int32_t) c);
+    return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
+}
+
+// Shifts the 2 signed or unsigned 64-bit integers in a left by count bits
+// while shifting in zeros.
+// +// r0 := a0 << count +// r1 := a1 << count +// +// https://msdn.microsoft.com/en-us/library/6ta9dffd(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (unlikely(c > 63)) + return _mm_setzero_si128(); + + int64x2_t vc = vdupq_n_s64((int64_t) c); + return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc)); +} + +// Shifts the 8 signed or unsigned 16-bit integers in a right by count bits +// while shifting in zeros. +// +// r0 := srl(a0, count) +// r1 := srl(a1, count) +// ... +// r7 := srl(a7, count) +// +// https://msdn.microsoft.com/en-us/library/wd5ax830(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (unlikely(c > 15)) + return _mm_setzero_si128(); + + int16x8_t vc = vdupq_n_s16(-(int16_t) c); + return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc)); +} + +// Shifts the 4 signed or unsigned 32-bit integers in a right by count bits +// while shifting in zeros. +// +// r0 := srl(a0, count) +// r1 := srl(a1, count) +// r2 := srl(a2, count) +// r3 := srl(a3, count) +// +// https://msdn.microsoft.com/en-us/library/a9cbttf4(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (unlikely(c > 31)) + return _mm_setzero_si128(); + + int32x4_t vc = vdupq_n_s32(-(int32_t) c); + return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc)); +} + +// Shifts the 2 signed or unsigned 64-bit integers in a right by count bits +// while shifting in zeros. +// +// r0 := srl(a0, count) +// r1 := srl(a1, count) +// +// https://msdn.microsoft.com/en-us/library/yf6cf9k8(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (unlikely(c > 63)) + return _mm_setzero_si128(); + + int64x2_t vc = vdupq_n_s64(-(int64_t) c); + return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc)); +} + +// NEON does not provide a version of this function. +// Creates a 16-bit mask from the most significant bits of the 16 signed or +// unsigned 8-bit integers in a and zero extends the upper bits. +// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx +FORCE_INLINE int _mm_movemask_epi8(__m128i a) +{ + // Use increasingly wide shifts+adds to collect the sign bits + // together. + // Since the widening shifts would be rather confusing to follow in little + // endian, everything will be illustrated in big endian order instead. This + // has a different result - the bits would actually be reversed on a big + // endian machine. + + // Starting input (only half the elements are shown): + // 89 ff 1d c0 00 10 99 33 + uint8x16_t input = vreinterpretq_u8_m128i(a); + + // Shift out everything but the sign bits with an unsigned shift right. + // + // Bytes of the vector:: + // 89 ff 1d c0 00 10 99 33 + // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7) + // | | | | | | | | + // 01 01 00 01 00 00 01 00 + // + // Bits of first important lane(s): + // 10001001 (89) + // \______ + // | + // 00000001 (01) + uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7)); + + // Merge the even lanes together with a 16-bit unsigned shift right + add. + // 'xx' represents garbage data which will be ignored in the final result. + // In the important bytes, the add functions like a binary OR. 
+    //
+    // 01 01 00 01 00 00 01 00
+    //  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(input + (input >> 7))
+    //    \|    \|    \|    \|
+    // xx 03 xx 01 xx 00 xx 02
+    //
+    // 00000001 00000001 (01 01)
+    //        \_______ |
+    //                \|
+    // xxxxxxxx xxxxxx11 (xx 03)
+    uint32x4_t paired16 =
+        vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
+
+    // Repeat with a wider 32-bit shift + add.
+    // xx 03 xx 01 xx 00 xx 02
+    //     \____ |     \____ |   paired32 = (uint64x1_t)(paired16 +
+    //          \|          \|                           (paired16 >> 14))
+    // xx xx xx 0d xx xx xx 02
+    //
+    // 00000011 00000001 (03 01)
+    //        \\_____ ||
+    //         '----.\||
+    // xxxxxxxx xxxx1101 (xx 0d)
+    uint64x2_t paired32 =
+        vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
+
+    // Last, an even wider 64-bit shift + add to get our result in the low
+    // 8-bit lanes.
+    // xx xx xx 0d xx xx xx 02
+    //            \_________ |   paired64 = (uint8x8_t)(paired32 +
+    //                      \|                          (paired32 >> 28))
+    // xx xx xx xx xx xx xx d2
+    //
+    // 00001101 00000010 (0d 02)
+    //     \   \___ |  |
+    //      '---.  \|  |
+    // xxxxxxxx 11010010 (xx d2)
+    uint8x16_t paired64 =
+        vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
+
+    // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
+    // xx xx xx xx xx xx xx d2
+    //                      ||   return paired64[0]
+    //                      d2
+    // Note: Little endian would return the correct value 4b (01001011)
+    // instead.
+    return vgetq_lane_u8(paired64, 0) |
+           ((int) vgetq_lane_u8(paired64, 8) << 8);
+}
+
+// Set each bit of mask dst based on the most significant bit of the
+// corresponding packed double-precision (64-bit) floating-point element in a.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pd
+FORCE_INLINE int _mm_movemask_pd(__m128d a)
+{
+    uint64x2_t input = vreinterpretq_u64_m128d(a);
+    uint64x2_t high_bits = vshrq_n_u64(input, 63);
+    return vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1);
+}
+
+// Copy the lower 64-bit integer in a to dst.
+//
+// dst[63:0] := a[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64
+FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
+{
+    return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
+}
+
+// Copy the 64-bit integer a to the lower element of dst, and zero the upper
+// element.
+//
+// dst[63:0] := a[63:0]
+// dst[127:64] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64
+FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
+{
+    return vreinterpretq_m128i_s64(
+        vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
+}
+
+// NEON does not provide this method
+// Creates a 4-bit mask from the most significant bits of the four
+// single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
+FORCE_INLINE int _mm_movemask_ps(__m128 a)
+{
+    uint32x4_t input = vreinterpretq_u32_m128(a);
+#if defined(__aarch64__)
+    static const int32x4_t shift = {0, 1, 2, 3};
+    uint32x4_t tmp = vshrq_n_u32(input, 31);
+    return vaddvq_u32(vshlq_u32(tmp, shift));
+#else
+    // Uses the exact same method as _mm_movemask_epi8, see that for details.
+    // Shift out everything but the sign bits with a 32-bit unsigned shift
+    // right.
+    uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
+    // Merge the two pairs together with a 64-bit unsigned shift right + add.
+    uint8x16_t paired =
+        vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
+    // Extract the result.
+ return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2); +#endif +} + +// Compute the bitwise NOT of a and then AND with a 128-bit vector containing +// all 1's, and return 1 if the result is zero, otherwise return 0. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones +FORCE_INLINE int _mm_test_all_ones(__m128i a) +{ + return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) == + ~(uint64_t) 0; +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and +// mask, and return 1 if the result is zero, otherwise return 0. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros +FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask) +{ + int64x2_t a_and_mask = + vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask)); + return (vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)) ? 0 + : 1; +} + +/* Math operations */ + +// Subtracts the four single-precision, floating-point values of a and b. +// +// r0 := a0 - b0 +// r1 := a1 - b1 +// r2 := a2 - b2 +// r3 := a3 - b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx +FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Subtract the lower single-precision (32-bit) floating-point element in b from +// the lower single-precision (32-bit) floating-point element in a, store the +// result in the lower element of dst, and copy the upper 3 packed elements from +// a to the upper elements of dst. +// +// dst[31:0] := a[31:0] - b[31:0] +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss +FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_sub_ps(a, b)); +} + +// Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a, +// and store the results in dst. +// r0 := a0 - b0 +// r1 := a1 - b1 +FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s64( + vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +} + +// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or +// unsigned 32-bit integers of a. +// +// r0 := a0 - b0 +// r1 := a1 - b1 +// r2 := a2 - b2 +// r3 := a3 - b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx +FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and +// store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi16 +FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and +// store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi8 +FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst. 
+//
+// dst[63:0] := a[63:0] - b[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64
+FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s64(
+        vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
+}
+
+// Subtracts the 8 unsigned 16-bit integers of b from the 8 unsigned 16-bit
+// integers of a and saturates.
+// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
+// integers of a and saturates.
+//
+// r0 := UnsignedSaturate(a0 - b0)
+// r1 := UnsignedSaturate(a1 - b1)
+// ...
+// r15 := UnsignedSaturate(a15 - b15)
+//
+// https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
+FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+#define _mm_ucomieq_sd _mm_comieq_sd
+#define _mm_ucomige_sd _mm_comige_sd
+#define _mm_ucomigt_sd _mm_comigt_sd
+#define _mm_ucomile_sd _mm_comile_sd
+#define _mm_ucomilt_sd _mm_comilt_sd
+#define _mm_ucomineq_sd _mm_comineq_sd
+
+// Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit
+// integers of a and saturates.
+//
+// r0 := SignedSaturate(a0 - b0)
+// r1 := SignedSaturate(a1 - b1)
+// ...
+// r15 := SignedSaturate(a15 - b15)
+//
+// https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
+FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit
+// integers of a and saturates.
+//
+// r0 := SignedSaturate(a0 - b0)
+// r1 := SignedSaturate(a1 - b1)
+// ...
+// r7 := SignedSaturate(a7 - b7)
+//
+// https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
+FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Subtract packed double-precision (64-bit) floating-point elements in b from
+// packed double-precision (64-bit) floating-point elements in a, and store
+// the results in dst.
+//
+// FOR j := 0 to 1
+// i := j*64
+// dst[i+63:i] := a[i+63:i] - b[i+63:i]
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_pd
+FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] - db[0];
+    c[1] = da[1] - db[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
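+
+// Usage sketch (illustrative addition, not part of the x86 API): unsigned
+// saturation clamps at zero instead of wrapping, so ORing the two one-sided
+// saturated differences yields the absolute difference per lane. The helper
+// name is hypothetical, and _mm_or_si128 is assumed to be provided elsewhere
+// in this header, as it is in upstream sse2neon.
+FORCE_INLINE __m128i sse2neon_example_absdiff_epu8(__m128i a, __m128i b)
+{
+    // max(a - b, 0) | max(b - a, 0) == |a - b| for unsigned 8-bit lanes.
+    return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
+}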
+
+// Subtract the lower double-precision (64-bit) floating-point element in b
+// from the lower double-precision (64-bit) floating-point element in a, store
+// the result in the lower element of dst, and copy the upper element from a
+// to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sd
+FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_sub_pd(a, b));
+}
+
+// Add packed unsigned 16-bit integers in a and b using saturation, and store
+// the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu16
+FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Negate packed 8-bit integers in a when the corresponding signed
+// 8-bit integer in b is negative, and store the results in dst.
+// Elements in dst are zeroed out when the corresponding element
+// in b is zero.
+//
+// for i in 0..15
+// if b[i] < 0
+// r[i] := -a[i]
+// else if b[i] == 0
+// r[i] := 0
+// else
+// r[i] := a[i]
+// fi
+// done
+FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
+{
+    int8x16_t a = vreinterpretq_s8_m128i(_a);
+    int8x16_t b = vreinterpretq_s8_m128i(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFF : 0
+    uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
+
+    // (b == 0) ? 0xFF : 0
+#if defined(__aarch64__)
+    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
+#else
+    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vnegq_s8(a) returns negative
+    // 'a') based on ltMask
+    int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
+    // res = masked & (~zeroMask)
+    int8x16_t res = vbicq_s8(masked, zeroMask);
+
+    return vreinterpretq_m128i_s8(res);
+}
+
+// Negate packed 16-bit integers in a when the corresponding signed
+// 16-bit integer in b is negative, and store the results in dst.
+// Elements in dst are zeroed out when the corresponding element
+// in b is zero.
+//
+// for i in 0..7
+// if b[i] < 0
+// r[i] := -a[i]
+// else if b[i] == 0
+// r[i] := 0
+// else
+// r[i] := a[i]
+// fi
+// done
+FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
+{
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFF : 0
+    uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
+    // (b == 0) ? 0xFFFF : 0
+#if defined(__aarch64__)
+    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
+#else
+    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vnegq_s16(a) equals to
+    // negative 'a') based on ltMask
+    int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
+    // res = masked & (~zeroMask)
+    int16x8_t res = vbicq_s16(masked, zeroMask);
+    return vreinterpretq_m128i_s16(res);
+}
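+
+// Usage sketch (illustrative addition, not part of the x86 API): passing the
+// same register for both operands of _mm_sign_epi16 yields the per-lane
+// absolute value, with the usual caveat that INT16_MIN negates to itself.
+// The helper name is hypothetical.
+FORCE_INLINE __m128i sse2neon_example_abs_epi16(__m128i b)
+{
+    // sign(b, b): negative lanes are negated, zero lanes stay zero.
+    return _mm_sign_epi16(b, b);
+}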
+
+// Negate packed 32-bit integers in a when the corresponding signed
+// 32-bit integer in b is negative, and store the results in dst.
+// Elements in dst are zeroed out when the corresponding element
+// in b is zero.
+//
+// for i in 0..3
+// if b[i] < 0
+// r[i] := -a[i]
+// else if b[i] == 0
+// r[i] := 0
+// else
+// r[i] := a[i]
+// fi
+// done
+FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
+{
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFFFFFF : 0
+    uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
+
+    // (b == 0) ? 0xFFFFFFFF : 0
+#if defined(__aarch64__)
+    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
+#else
+    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vnegq_s32(a) equals to
+    // negative 'a') based on ltMask
+    int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
+    // res = masked & (~zeroMask)
+    int32x4_t res = vbicq_s32(masked, zeroMask);
+    return vreinterpretq_m128i_s32(res);
+}
+
+// Negate packed 16-bit integers in a when the corresponding signed 16-bit
+// integer in b is negative, and store the results in dst. Elements in dst are
+// zeroed out when the corresponding element in b is zero.
+//
+// FOR j := 0 to 3
+// i := j*16
+// IF b[i+15:i] < 0
+// dst[i+15:i] := -(a[i+15:i])
+// ELSE IF b[i+15:i] == 0
+// dst[i+15:i] := 0
+// ELSE
+// dst[i+15:i] := a[i+15:i]
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16
+FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
+{
+    int16x4_t a = vreinterpret_s16_m64(_a);
+    int16x4_t b = vreinterpret_s16_m64(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFF : 0
+    uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
+
+    // (b == 0) ? 0xFFFF : 0
+#if defined(__aarch64__)
+    int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
+#else
+    int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vneg_s16(a) returns negative
+    // 'a') based on ltMask
+    int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
+    // res = masked & (~zeroMask)
+    int16x4_t res = vbic_s16(masked, zeroMask);
+
+    return vreinterpret_m64_s16(res);
+}
+
+// Negate packed 32-bit integers in a when the corresponding signed 32-bit
+// integer in b is negative, and store the results in dst. Elements in dst are
+// zeroed out when the corresponding element in b is zero.
+//
+// FOR j := 0 to 1
+// i := j*32
+// IF b[i+31:i] < 0
+// dst[i+31:i] := -(a[i+31:i])
+// ELSE IF b[i+31:i] == 0
+// dst[i+31:i] := 0
+// ELSE
+// dst[i+31:i] := a[i+31:i]
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32
+FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
+{
+    int32x2_t a = vreinterpret_s32_m64(_a);
+    int32x2_t b = vreinterpret_s32_m64(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFFFFFF : 0
+    uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
+
+    // (b == 0) ? 0xFFFFFFFF : 0
+#if defined(__aarch64__)
+    int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
+#else
+    int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vneg_s32(a) returns negative
+    // 'a') based on ltMask
+    int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
+    // res = masked & (~zeroMask)
+    int32x2_t res = vbic_s32(masked, zeroMask);
+
+    return vreinterpret_m64_s32(res);
+}
+
+// Negate packed 8-bit integers in a when the corresponding signed 8-bit
+// integer in b is negative, and store the results in dst. Elements in dst are
+// zeroed out when the corresponding element in b is zero.
+//
+// FOR j := 0 to 7
+// i := j*8
+// IF b[i+7:i] < 0
+// dst[i+7:i] := -(a[i+7:i])
+// ELSE IF b[i+7:i] == 0
+// dst[i+7:i] := 0
+// ELSE
+// dst[i+7:i] := a[i+7:i]
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8
+FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
+{
+    int8x8_t a = vreinterpret_s8_m64(_a);
+    int8x8_t b = vreinterpret_s8_m64(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFF : 0
+    uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
+
+    // (b == 0) ? 0xFF : 0
+#if defined(__aarch64__)
+    int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
+#else
+    int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vneg_s8(a) returns negative
+    // 'a') based on ltMask
+    int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
+    // res = masked & (~zeroMask)
+    int8x8_t res = vbic_s8(masked, zeroMask);
+
+    return vreinterpret_m64_s8(res);
+}
+
+// Average packed unsigned 16-bit integers in a and b, and store the results
+// in dst.
+//
+// FOR j := 0 to 3
+// i := j*16
+// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16
+FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u16(
+        vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
+}
+
+// Average packed unsigned 8-bit integers in a and b, and store the results in
+// dst.
+//
+// FOR j := 0 to 7
+// i := j*8
+// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8
+FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u8(
+        vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+}
+
+// Average packed unsigned 8-bit integers in a and b, and store the results in
+// dst.
+//
+// FOR j := 0 to 7
+// i := j*8
+// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb
+#define _m_pavgb(a, b) _mm_avg_pu8(a, b)
+
+// Average packed unsigned 16-bit integers in a and b, and store the results
+// in dst.
+//
+// FOR j := 0 to 3
+// i := j*16
+// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw
+#define _m_pavgw(a, b) _mm_avg_pu16(a, b)
+
+// Extract a 16-bit integer from a, selected with imm8, and store the result
+// in the lower element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pextrw
+#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
+
+// Copy a to dst, and insert the 16-bit integer i into dst at the location
+// specified by imm8.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_pinsrw
+#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
+
+// Compare packed signed 16-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxsw
+#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxub
+#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
+
+// Compare packed signed 16-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminsw +#define _m_pminsw(a, b) _mm_min_pi16(a, b) + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminub +#define _m_pminub(a, b) _mm_min_pu8(a, b) + +// Create mask from the most significant bit of each 8-bit element in a, and +// store the result in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmovmskb +#define _m_pmovmskb(a) _mm_movemask_pi8(a) + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw +#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b) + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce four +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_psadbw +#define _m_psadbw(a, b) _mm_sad_pu8(a, b) + +// Computes the average of the 16 unsigned 8-bit integers in a and the 16 +// unsigned 8-bit integers in b and rounds. +// +// r0 := (a0 + b0) / 2 +// r1 := (a1 + b1) / 2 +// ... +// r15 := (a15 + b15) / 2 +// +// https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Shift a left by imm8 bytes while shifting in zeros, and store the results in +// dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bslli_si128 +#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm) + +// Shift a right by imm8 bytes while shifting in zeros, and store the results in +// dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bsrli_si128 +#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm) + +// Computes the average of the 8 unsigned 16-bit integers in a and the 8 +// unsigned 16-bit integers in b and rounds. +// +// r0 := (a0 + b0) / 2 +// r1 := (a1 + b1) / 2 +// ... +// r7 := (a7 + b7) / 2 +// +// https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx +FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b) +{ + return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a), + vreinterpretq_u16_m128i(b)); +} + +// Adds the four single-precision, floating-point values of a and b. +// +// r0 := a0 + b0 +// r1 := a1 + b1 +// r2 := a2 + b2 +// r3 := a3 + b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx +FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Add packed double-precision (64-bit) floating-point elements in a and b, and +// store the results in dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd
+FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] + db[0];
+    c[1] = da[1] + db[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Add the lower double-precision (64-bit) floating-point element in a and b,
+// store the result in the lower element of dst, and copy the upper element
+// from a to the upper element of dst.
+//
+// dst[63:0] := a[63:0] + b[63:0]
+// dst[127:64] := a[127:64]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sd
+FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_add_pd(a, b));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] + db[0];
+    c[1] = da[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Add 64-bit integers a and b, and store the result in dst.
+//
+// dst[63:0] := a[63:0] + b[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64
+FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s64(
+        vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
+}
+
+// Adds the scalar single-precision floating point values of a and b.
+// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
+{
+    float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
+    float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
+    // the upper values in the result must be the remnants of a.
+    return vreinterpretq_m128_f32(vaddq_f32(a, value));
+}
+
+// Adds the 2 signed or unsigned 64-bit integers in a to the 2 signed or
+// unsigned 64-bit integers in b.
+// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s64(
+        vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+}
+
+// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
+// unsigned 32-bit integers in b.
+//
+// r0 := a0 + b0
+// r1 := a1 + b1
+// r2 := a2 + b2
+// r3 := a3 + b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
+// unsigned 16-bit integers in b.
+// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
+// unsigned 8-bit integers in b.
+// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
+FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
+// and saturates.
+//
+// r0 := SignedSaturate(a0 + b0)
+// r1 := SignedSaturate(a1 + b1)
+// ...
+// r7 := SignedSaturate(a7 + b7) +// +// https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx +FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Add packed signed 8-bit integers in a and b using saturation, and store the +// results in dst. +// +// FOR j := 0 to 15 +// i := j*8 +// dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8 +FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in +// b and saturates.. +// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx +FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or +// unsigned 16-bit integers from b. +// +// r0 := (a0 * b0)[15:0] +// r1 := (a1 * b1)[15:0] +// ... +// r7 := (a7 * b7)[15:0] +// +// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx +FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or +// unsigned 32-bit integers from b. +// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx +FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// tmp[31:0] := a[i+15:i] * b[i+15:i] +// dst[i+15:i] := tmp[31:16] +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw +#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b) + +// Multiplies the four single-precision, floating-point values of a and b. +// +// r0 := a0 * b0 +// r1 := a1 * b1 +// r2 := a2 * b2 +// r3 := a3 * b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx +FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Multiply packed double-precision (64-bit) floating-point elements in a and b, +// and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd +FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[2]; + c[0] = da[0] * db[0]; + c[1] = da[1] * db[1]; + return vld1q_f32((float32_t *) c); +#endif +} + +// Multiply the lower double-precision (64-bit) floating-point element in a and +// b, store the result in the lower element of dst, and copy the upper element +// from a to the upper element of dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_sd
+FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_mul_pd(a, b));
+}
+
+// Multiply the lower single-precision (32-bit) floating-point element in a
+// and b, store the result in the lower element of dst, and copy the upper 3
+// packed elements from a to the upper elements of dst.
+//
+// dst[31:0] := a[31:0] * b[31:0]
+// dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss
+FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_mul_ps(a, b));
+}
+
+// Multiply the low unsigned 32-bit integers from each packed 64-bit element
+// in a and b, and store the unsigned 64-bit results in dst.
+//
+// r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
+// r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
+FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
+{
+    // vmull_u32 upcasts instead of masking, so we downcast.
+    uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
+    uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
+    return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
+}
+
+// Multiply the low unsigned 32-bit integers from a and b, and store the
+// unsigned 64-bit result in dst.
+//
+// dst[63:0] := a[31:0] * b[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32
+FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u64(vget_low_u64(
+        vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
+}
+
+// Multiply the low signed 32-bit integers from each packed 64-bit element in
+// a and b, and store the signed 64-bit results in dst.
+//
+// r0 := (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0
+// r1 := (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2
+FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
+{
+    // vmull_s32 upcasts instead of masking, so we downcast.
+    int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
+    int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
+    return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
+}
+
+// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
+// integers from b, then horizontally adds adjacent pairs of the intermediate
+// 32-bit products.
+//
+// r0 := (a0 * b0) + (a1 * b1)
+// r1 := (a2 * b2) + (a3 * b3)
+// r2 := (a4 * b4) + (a5 * b5)
+// r3 := (a6 * b6) + (a7 * b7)
+// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
+{
+    int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
+                              vget_low_s16(vreinterpretq_s16_m128i(b)));
+    int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
+                               vget_high_s16(vreinterpretq_s16_m128i(b)));
+
+    int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
+    int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
+
+    return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
+}
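+
+// Usage sketch (illustrative addition, not part of the x86 API):
+// _mm_madd_epi16 is the usual building block for 16-bit dot products;
+// summing its four 32-bit outputs gives the dot product of all eight lane
+// pairs. The helper name is hypothetical, and _mm_cvtsi128_si32 is assumed
+// to be provided elsewhere in this header, as it is in upstream sse2neon.
+FORCE_INLINE int32_t sse2neon_example_dot8_epi16(__m128i a, __m128i b)
+{
+    __m128i prod = _mm_madd_epi16(a, b);  // four pairwise sums of products
+    // Horizontal add: fold the high 64 bits onto the low 64 bits, then fold
+    // lane 1 onto lane 0.
+    __m128i sum =
+        _mm_add_epi32(prod, _mm_shuffle_epi32(prod, _MM_SHUFFLE(1, 0, 3, 2)));
+    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, _MM_SHUFFLE(2, 3, 0, 1)));
+    return _mm_cvtsi128_si32(sum);
+}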
+
+// Conditionally store 8-bit integer elements from a into memory using mask
+// (elements are not stored when the highest bit is not set in the
+// corresponding element) and a non-temporal memory hint. mem_addr does not
+// need to be aligned on any particular boundary.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmoveu_si128
+FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
+{
+    int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
+    // Use an unaligned NEON load; mem_addr carries no alignment guarantee.
+    int8x16_t b = vld1q_s8((const int8_t *) mem_addr);
+    int8x16_t masked =
+        vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a), b);
+    vst1q_s8((int8_t *) mem_addr, masked);
+}
+
+// Multiply packed signed 16-bit integers in a and b, producing intermediate
+// signed 32-bit integers. Shift right by 15 bits while rounding up, and store
+// the packed 16-bit integers in dst.
+//
+// r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)
+// r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)
+// r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)
+// ...
+// r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)
+FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
+{
+    // Has issues due to saturation
+    // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
+
+    // Multiply
+    int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
+                                 vget_low_s16(vreinterpretq_s16_m128i(b)));
+    int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
+                                 vget_high_s16(vreinterpretq_s16_m128i(b)));
+
+    // Rounding narrowing shift right
+    // narrow = (int16_t)((mul + 16384) >> 15);
+    int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
+    int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
+
+    // Join together
+    return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
+}
+
+// Vertically multiply each unsigned 8-bit integer from a with the
+// corresponding signed 8-bit integer from b, producing intermediate signed
+// 16-bit integers. Horizontally add adjacent pairs of intermediate signed
+// 16-bit integers, and pack the saturated results in dst.
+//
+// FOR j := 0 to 7
+// i := j*16
+// dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] +
+// a[i+7:i]*b[i+7:i] )
+// ENDFOR
+FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
+{
+#if defined(__aarch64__)
+    uint8x16_t a = vreinterpretq_u8_m128i(_a);
+    int8x16_t b = vreinterpretq_s8_m128i(_b);
+    int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
+                             vmovl_s8(vget_low_s8(b)));
+    int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
+                             vmovl_s8(vget_high_s8(b)));
+    return vreinterpretq_m128i_s16(
+        vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
+#else
+    // This would be much simpler if x86 would choose to zero extend OR sign
+    // extend, not both. This could probably be optimized better.
+    uint16x8_t a = vreinterpretq_u16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+
+    // Zero extend a
+    int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
+    int16x8_t a_even =
+        vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
+
+    // Sign extend by shifting left then shifting right.
+    int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
+    int16x8_t b_odd = vshrq_n_s16(b, 8);
+
+    // multiply
+    int16x8_t prod1 = vmulq_s16(a_even, b_even);
+    int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
+
+    // saturated add
+    return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
+#endif
+}
+
+// Computes the fused multiply-add product of 32-bit floating point numbers.
+//
+// Return Value
+// Multiplies A and B, and adds C to the temporary result before returning it.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd +FORCE_INLINE __m128 _mm_fmadd_ps(__m128 a, __m128 b, __m128 c) +{ +#if defined(__aarch64__) + return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(c), + vreinterpretq_f32_m128(b), + vreinterpretq_f32_m128(a))); +#else + return _mm_add_ps(_mm_mul_ps(a, b), c); +#endif +} + +// Alternatively add and subtract packed double-precision (64-bit) +// floating-point elements in a to/from packed elements in b, and store the +// results in dst. +// +// FOR j := 0 to 1 +// i := j*64 +// IF ((j & 1) == 0) +// dst[i+63:i] := a[i+63:i] - b[i+63:i] +// ELSE +// dst[i+63:i] := a[i+63:i] + b[i+63:i] +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_addsub_pd +FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b) +{ + __m128d mask = _mm_set_pd(1.0f, -1.0f); +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a), + vreinterpretq_f64_m128d(b), + vreinterpretq_f64_m128d(mask))); +#else + return _mm_add_pd(_mm_mul_pd(b, mask), a); +#endif +} + +// Alternatively add and subtract packed single-precision (32-bit) +// floating-point elements in a to/from packed elements in b, and store the +// results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps +FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b) +{ + __m128 mask = {-1.0f, 1.0f, -1.0f, 1.0f}; + return _mm_fmadd_ps(b, mask, a); +} + +// Horizontally add adjacent pairs of double-precision (64-bit) floating-point +// elements in a and b, and pack the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pd +FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[] = {da[0] + da[1], db[0] + db[1]}; + return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c)); +#endif +} + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce two +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of 64-bit elements in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8 +FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b) +{ + uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b)); + uint16_t r0 = t[0] + t[1] + t[2] + t[3]; + uint16_t r4 = t[4] + t[5] + t[6] + t[7]; + uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0); + return (__m128i) vsetq_lane_u16(r4, r, 4); +} + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce four +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8
+FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
+{
+    uint16x4_t t =
+        vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+    uint16_t r0 = t[0] + t[1] + t[2] + t[3];
+    return vreinterpret_m64_u16(vset_lane_u16(r0, vdup_n_u16(0), 0));
+}
+
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce four
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of dst.
+//
+// FOR j := 0 to 7
+//     i := j*8
+//     tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
+// ENDFOR
+// dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] +
+//              tmp[47:40] + tmp[55:48] + tmp[63:56]
+// dst[63:16] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_psadbw
+#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
+
+// Divides the four single-precision, floating-point values of a and b.
+//
+// r0 := a0 / b0
+// r1 := a1 / b1
+// r2 := a2 / b2
+// r3 := a3 / b3
+//
+// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV
+    return vreinterpretq_m128_f32(
+        vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
+#if SSE2NEON_PRECISE_DIV
+    // Additional Newton-Raphson iteration for accuracy
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
+#endif
+    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
+#endif
+}
+
+// Divides the scalar single-precision floating point value of a by b.
+// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
+{
+    float32_t value =
+        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
+}
+
+// Divide packed double-precision (64-bit) floating-point elements in a by
+// packed elements in b, and store the results in dst.
+//
+// FOR j := 0 to 1
+//     i := 64*j
+//     dst[i+63:i] := a[i+63:i] / b[i+63:i]
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_pd
+FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] / db[0];
+    c[1] = da[1] / db[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
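+
+// Illustrative note (added commentary, not part of upstream sse2neon): the
+// vrecpsq_f32 step above is one Newton-Raphson refinement of the reciprocal
+// estimate. Given an estimate x of 1/d, vrecpsq_f32(x, d) returns (2 - d*x),
+// so
+//
+//     x' = x * (2 - d*x)
+//
+// roughly doubles the number of correct bits per iteration. The scalar
+// equivalent of one refinement step is:
+//
+//     float refine_recip(float x, float d) { return x * (2.0f - d * x); }
+
+// Divide the lower double-precision (64-bit) floating-point element in a by the
+// lower double-precision (64-bit) floating-point element in b, store the result
+// in the lower element of dst, and copy the upper element from a to the upper
+// element of dst.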
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sd
+FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    float64x2_t tmp =
+        vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
+    return vreinterpretq_m128d_f64(
+        vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
+#else
+    return _mm_move_sd(a, _mm_div_pd(a, b));
+#endif
+}
+
+// Compute the approximate reciprocal of packed single-precision (32-bit)
+// floating-point elements in a, and store the results in dst. The maximum
+// relative error for this approximation is less than 1.5*2^-12.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps
+FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
+{
+    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
+#if SSE2NEON_PRECISE_DIV
+    // Additional Newton-Raphson iteration for accuracy
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
+#endif
+    return vreinterpretq_m128_f32(recip);
+}
+
+// Compute the approximate reciprocal of the lower single-precision (32-bit)
+// floating-point element in a, store the result in the lower element of dst,
+// and copy the upper 3 packed elements from a to the upper elements of dst. The
+// maximum relative error for this approximation is less than 1.5*2^-12.
+//
+// dst[31:0] := (1.0 / a[31:0])
+// dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss
+FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
+{
+    return _mm_move_ss(a, _mm_rcp_ps(a));
+}
+
+// Computes the approximations of square roots of the four single-precision,
+// floating-point values of a. First computes reciprocal square roots and then
+// reciprocals of the four values.
+//
+// r0 := sqrt(a0)
+// r1 := sqrt(a1)
+// r2 := sqrt(a2)
+// r3 := sqrt(a3)
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
+{
+#if SSE2NEON_PRECISE_SQRT
+    float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+
+    // Test for vrsqrteq_f32(0) -> positive infinity case.
+    // Change to zero, so that s * 1/sqrt(s) result is zero too.
+    const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
+    const uint32x4_t div_by_zero =
+        vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
+    recip = vreinterpretq_f32_u32(
+        vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
+
+    // Additional Newton-Raphson iteration for accuracy
+    recip = vmulq_f32(
+        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
+        recip);
+    recip = vmulq_f32(
+        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
+        recip);
+
+    // sqrt(s) = s * 1/sqrt(s)
+    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
+#elif defined(__aarch64__)
+    return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
+#else
+    float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+    float32x4_t sq = vrecpeq_f32(recipsq);
+    return vreinterpretq_m128_f32(sq);
+#endif
+}
+
+// Computes the approximation of the square root of the scalar single-precision
+// floating point value of in.
+// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
+{
+    float32_t value =
+        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
+}
+
+// Computes the approximations of the reciprocal square roots of the four
+// single-precision floating point values of in.
+// The current precision is 1% error.
+// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
+{
+    float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+#if SSE2NEON_PRECISE_SQRT
+    // Additional Newton-Raphson iteration for accuracy
+    out = vmulq_f32(
+        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
+    out = vmulq_f32(
+        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
+#endif
+    return vreinterpretq_m128_f32(out);
+}
+
+// Compute the approximate reciprocal square root of the lower single-precision
+// (32-bit) floating-point element in a, store the result in the lower element
+// of dst, and copy the upper 3 packed elements from a to the upper elements of
+// dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss
+FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
+{
+    return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
+}
+
+// Compare packed signed 16-bit integers in a and b, and store packed maximum
+// values in dst.
+//
+// FOR j := 0 to 3
+//     i := j*16
+//     dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
+FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s16(
+        vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
+}
+
+// Compare packed signed 16-bit integers in a and b, and store packed maximum
+// values in dst.
+//
+// FOR j := 0 to 3
+//     i := j*16
+//     dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
+#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
+
+// Computes the maximums of the four single-precision, floating-point values of
+// a and b.
+// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
+{
+#if SSE2NEON_PRECISE_MINMAX
+    float32x4_t _a = vreinterpretq_f32_m128(a);
+    float32x4_t _b = vreinterpretq_f32_m128(b);
+    return vbslq_f32(vcltq_f32(_b, _a), _a, _b);
+#else
+    return vreinterpretq_m128_f32(
+        vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#endif
+}
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
+// values in dst.
+//
+// FOR j := 0 to 7
+//     i := j*8
+//     dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
+FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u8(
+        vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+}
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
+// values in dst.
+//
+// FOR j := 0 to 7
+//     i := j*8
+//     dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
+#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
+
+// Compare packed signed 16-bit integers in a and b, and store packed minimum
+// values in dst.
+// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16 +FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b) +{ + return vreinterpret_m64_s16( + vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); +} + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16 +#define _m_pminsw(a, b) _mm_min_pi16(a, b) + +// Computes the minima of the four single-precision, floating-point values of a +// and b. +// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx +FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b) +{ +#if SSE2NEON_PRECISE_MINMAX + float32x4_t _a = vreinterpretq_f32_m128(a); + float32x4_t _b = vreinterpretq_f32_m128(b); + return vbslq_f32(vcltq_f32(_a, _b), _a, _b); +#else + return vreinterpretq_m128_f32( + vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#endif +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8 +FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b) +{ + return vreinterpret_m64_u8( + vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8 +#define _m_pminub(a, b) _mm_min_pu8(a, b) + +// Computes the maximum of the two lower scalar single-precision floating point +// values of a and b. +// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx +FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b) +{ + float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Computes the minimum of the two lower scalar single-precision floating point +// values of a and b. +// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx +FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b) +{ + float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the +// 16 unsigned 8-bit integers from b. +// https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx +FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b, +// and store packed maximum values in dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pd
+FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
+    d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b, store the maximum value in the lower element of dst, and copy the upper
+// element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sd
+FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_max_pd(a, b));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2] = {fmax(da[0], db[0]), da[1]};
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the
+// 16 unsigned 8-bit integers from b.
+// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b,
+// and store packed minimum values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pd
+FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
+    d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b, store the minimum value in the lower element of dst, and copy the upper
+// element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sd
+FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_min_pd(a, b));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2] = {fmin(da[0], db[0]), da[1]};
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8
+// signed 16-bit integers from b.
+// https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compare packed signed 8-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8
+FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed unsigned 16-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16
+FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Compare packed signed 8-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8
+FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed unsigned 16-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16
+FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
+// signed 16-bit integers from b.
+// https://msdn.microsoft.com/en-us/library/3x060h7c(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// epi versions of min/max
+// Computes the pairwise maximums of the four signed 32-bit integer values of a
+// and b.
+//
+// A 128-bit parameter that can be defined with the following equations:
+// r0 := (a0 > b0) ? a0 : b0
+// r1 := (a1 > b1) ? a1 : b1
+// r2 := (a2 > b2) ? a2 : b2
+// r3 := (a3 > b3) ? a3 : b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Computes the pairwise minima of the four signed 32-bit integer values of a
+// and b.
+//
+// A 128-bit parameter that can be defined with the following equations:
+// r0 := (a0 < b0) ? a0 : b0
+// r1 := (a1 < b1) ? a1 : b1
+// r2 := (a2 < b2) ? a2 : b2
+// r3 := (a3 < b3) ? a3 : b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed unsigned 32-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
+FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
+}
+
+// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu32
+FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
+}
+
+// Multiply the packed unsigned 16-bit integers in a and b, producing
+// intermediate 32-bit integers, and store the high 16 bits of the intermediate
+// integers in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16
+FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u16(vshrn_n_u32(
+        vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
+}
+
+// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
+// integers from b.
+//
+// r0 := (a0 * b0)[31:16]
+// r1 := (a1 * b1)[31:16]
+// ...
+// r7 := (a7 * b7)[31:16]
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
+{
+    /* FIXME: issue with large values because of result saturation */
+    // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
+    // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
+    // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
+    int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
+    int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
+    int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
+    int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
+    uint16x8x2_t r =
+        vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
+    return vreinterpretq_m128i_u16(r.val[1]);
+}
+
+// Multiply the packed unsigned 16-bit integers in a and b, producing
+// intermediate 32-bit integers, and store the high 16 bits of the intermediate
+// integers in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epu16
+FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
+{
+    uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
+    uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
+    uint32x4_t ab3210 = vmull_u16(a3210, b3210);
+#if defined(__aarch64__)
+    uint32x4_t ab7654 =
+        vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
+    uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
+                              vreinterpretq_u16_u32(ab7654));
+    return vreinterpretq_m128i_u16(r);
+#else
+    uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
+    uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
+    uint32x4_t ab7654 = vmull_u16(a7654, b7654);
+    uint16x8x2_t r =
+        vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
+    return vreinterpretq_m128i_u16(r.val[1]);
+#endif
+}
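+
+// Illustrative sketch (added commentary, not part of upstream sse2neon):
+// _mm_mulhi_epi16 keeps only the high half of each 32-bit product, which is
+// how 16x16->16 fixed-point scaling is usually written. Assuming the test
+// values below:
+//
+//     __m128i a = _mm_set1_epi16(2000);
+//     __m128i b = _mm_set1_epi16(3000);
+//     __m128i h = _mm_mulhi_epi16(a, b);  // 2000*3000 = 0x005B8D80, so every
+//                                         // lane holds 0x005B (91)
+
+// Computes the pairwise addition of the single-precision, floating-point
+// values of a and b.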
+// https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
+FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
+#endif
+}
+
+// Computes the pairwise addition of the 16-bit signed or unsigned integer
+// values of a and b.
+FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
+{
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
+#else
+    return vreinterpretq_m128i_s16(
+        vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
+                     vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of double-precision (64-bit)
+// floating-point elements in a and b, and pack the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pd
+FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vsubq_f64(
+        vuzp1q_f64(vreinterpretq_f64_m128d(_a), vreinterpretq_f64_m128d(_b)),
+        vuzp2q_f64(vreinterpretq_f64_m128d(_a), vreinterpretq_f64_m128d(_b))));
+#else
+    double *da = (double *) &_a;
+    double *db = (double *) &_b;
+    double c[] = {da[0] - da[1], db[0] - db[1]};
+    return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of single-precision (32-bit)
+// floating-point elements in a and b, and pack the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps
+FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(vsubq_f32(
+        vuzp1q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)),
+        vuzp2q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b))));
+#else
+    float32x4x2_t c =
+        vuzpq_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b));
+    return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
+#endif
+}
+
+// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
+// signed 16-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16
+FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s16(
+        vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
+}
+
+// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
+// signed 32-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32
+FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s32(
+        vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
+}
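+
+// Illustrative sketch (added commentary, not part of upstream sse2neon): the
+// horizontal ops reduce within each argument, then concatenate the two
+// results. Assuming the test values below:
+//
+//     __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  // a = {1, 2, 3, 4}
+//     __m128 b = _mm_set_ps(8.0f, 7.0f, 6.0f, 5.0f);  // b = {5, 6, 7, 8}
+//     __m128 h = _mm_hadd_ps(a, b);                   // h = {3, 7, 11, 15}
+
+// Computes the pairwise difference of the 16-bit signed or unsigned integer
+// values of a and b.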
+FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
+{
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+    // Interleave using vshrn/vmovn
+    // [a0|a2|a4|a6|b0|b2|b4|b6]
+    // [a1|a3|a5|a7|b1|b3|b5|b7]
+    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
+    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
+    // Subtract
+    return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357));
+}
+
+// Computes the saturated pairwise addition of the 16-bit signed integer values
+// of a and b.
+FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
+{
+#if defined(__aarch64__)
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+    return vreinterpretq_m128i_s16(
+        vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
+#else
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+    // Interleave using vshrn/vmovn
+    // [a0|a2|a4|a6|b0|b2|b4|b6]
+    // [a1|a3|a5|a7|b1|b3|b5|b7]
+    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
+    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
+    // Saturated add
+    return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
+#endif
+}
+
+// Computes the saturated pairwise difference of the 16-bit signed integer
+// values of a and b.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16
+FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
+{
+#if defined(__aarch64__)
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+    return vreinterpretq_m128i_s16(
+        vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
+#else
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+    // Interleave using vshrn/vmovn
+    // [a0|a2|a4|a6|b0|b2|b4|b6]
+    // [a1|a3|a5|a7|b1|b3|b5|b7]
+    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
+    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
+    // Saturated subtract
+    return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357));
+#endif
+}
+
+// Computes the pairwise addition of the 32-bit signed or unsigned integer
+// values of a and b.
+FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
+{
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+    return vreinterpretq_m128i_s32(
+        vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
+                     vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
+}
+
+// Computes the pairwise difference of the 32-bit signed or unsigned integer
+// values of a and b.
+FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
+{
+    int64x2_t a = vreinterpretq_s64_m128i(_a);
+    int64x2_t b = vreinterpretq_s64_m128i(_b);
+    // Interleave using vshrn/vmovn
+    // [a0|a2|b0|b2]
+    // [a1|a3|b1|b3]
+    int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b));
+    int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32));
+    // Subtract
+    return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13));
+}
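+
+// Illustrative note (added commentary, not part of upstream sse2neon):
+// compensated (Kahan) summation carries a running error term so that adding a
+// small value to a large accumulator does not silently drop the low-order
+// bits. For example, in plain float arithmetic 1.0e8f + 1.0f == 1.0e8f, while
+// the compensation term below preserves the lost 1.0f across further adds.
+
+// Kahan summation for accurate summation of floating-point numbers.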
+// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
+FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y)
+{
+    y -= *c;
+    float t = *sum + y;
+    *c = (t - *sum) - y;
+    *sum = t;
+}
+
+// Conditionally multiply the packed single-precision (32-bit) floating-point
+// elements in a and b using the high 4 bits of imm, sum the four products,
+// and conditionally store the sum in dst using the low 4 bits of imm.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps
+FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
+{
+#if defined(__aarch64__)
+    /* shortcuts */
+    if (imm == 0xFF) {
+        return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));
+    }
+    if (imm == 0x7F) {
+        float32x4_t m = _mm_mul_ps(a, b);
+        m[3] = 0;
+        return _mm_set1_ps(vaddvq_f32(m));
+    }
+#endif
+
+    float s = 0, c = 0;
+    float32x4_t f32a = vreinterpretq_f32_m128(a);
+    float32x4_t f32b = vreinterpretq_f32_m128(b);
+
+    /* To improve the accuracy of floating-point summation, Kahan algorithm
+     * is used for each operation.
+     */
+    if (imm & (1 << 4))
+        _sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);
+    if (imm & (1 << 5))
+        _sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);
+    if (imm & (1 << 6))
+        _sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);
+    if (imm & (1 << 7))
+        _sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);
+    s += c;
+
+    float32x4_t res = {
+        (imm & 0x1) ? s : 0,
+        (imm & 0x2) ? s : 0,
+        (imm & 0x4) ? s : 0,
+        (imm & 0x8) ? s : 0,
+    };
+    return vreinterpretq_m128_f32(res);
+}
+
+/* Compare operations */
+
+// Compares for less than
+// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compares for less than
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100)
+FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmplt_ps(a, b));
+}
+
+// Compares for greater than.
+//
+// r0 := (a0 > b0) ? 0xffffffff : 0x0
+// r1 := (a1 > b1) ? 0xffffffff : 0x0
+// r2 := (a2 > b2) ? 0xffffffff : 0x0
+// r3 := (a3 > b3) ? 0xffffffff : 0x0
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compares for greater than.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
+}
+
+// Compares for greater than or equal.
+// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compares for greater than or equal.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpge_ps(a, b));
+}
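+
+// Illustrative sketch (added commentary, not part of upstream sse2neon): the
+// compare intrinsics return an all-ones/all-zeros mask per lane, which is
+// typically combined with the bitwise ops to build branchless selects:
+//
+//     __m128 mask = _mm_cmplt_ps(a, b);             // 0xffffffff where a < b
+//     __m128 low  = _mm_or_ps(_mm_and_ps(mask, a),  // per-lane min(a, b)
+//                             _mm_andnot_ps(mask, b));
+
+// Compares for less than or equal.
+//
+// r0 := (a0 <= b0) ? 0xffffffff : 0x0
+// r1 := (a1 <= b1) ? 0xffffffff : 0x0
+// r2 := (a2 <= b2) ? 0xffffffff : 0x0
+// r3 := (a3 <= b3) ?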
0xffffffff : 0x0 +// +// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compares for less than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100) +FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmple_ps(a, b)); +} + +// Compares for equality. +// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compares for equality. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100) +FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpeq_ps(a, b)); +} + +// Compares for inequality. +// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compares for inequality. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100) +FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpneq_ps(a, b)); +} + +// Compares for not greater than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b) +{ + return _mm_cmplt_ps(a, b); +} + +// Compares for not greater than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b) +{ + return _mm_cmplt_ss(a, b); +} + +// Compares for not greater than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100) +FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b) +{ + return _mm_cmple_ps(a, b); +} + +// Compares for not greater than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100) +FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b) +{ + return _mm_cmple_ss(a, b); +} + +// Compares for not less than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b) +{ + return _mm_cmpgt_ps(a, b); +} + +// Compares for not less than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b) +{ + return _mm_cmpgt_ss(a, b); +} + +// Compares for not less than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b) +{ + return _mm_cmpge_ps(a, b); +} + +// Compares for not less than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b) +{ + return _mm_cmpge_ss(a, b); +} + +// Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or +// unsigned 8-bit integers in b for equality. 
+// https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx +FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for equality, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_pd +FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_u64( + vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) + uint32x4_t cmp = + vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); + uint32x4_t swapped = vrev64q_u32(cmp); + return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for equality, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_sd +FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpeq_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for greater-than-or-equal, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_pd +FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_u64( + vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for greater-than-or-equal, store the result in the lower element of dst, +// and copy the upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_sd +FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return _mm_move_sd(a, _mm_cmpge_pd(a, b)); +#else + // expand "_mm_cmpge_pd()" to reduce unnecessary operations + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or +// unsigned 16-bit integers in b for equality. 
+// https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compare packed 32-bit integers in a and b for equality, and store the results
+// in dst.
+FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed 64-bit integers in a and b for equality, and store the results
+// in dst.
+FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_u64(
+        vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
+#else
+    // ARMv7 lacks vceqq_u64
+    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
+    uint32x4_t cmp =
+        vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
+    uint32x4_t swapped = vrev64q_u32(cmp);
+    return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
+#endif
+}
+
+// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
+// in b for less than.
+// https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for less-than, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_pd
+FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(
+        vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for less-than, store the result in the lower element of dst, and copy the
+// upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_sd
+FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_cmplt_pd(a, b));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-equal, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_pd +FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64( + vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))))); +#else + // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) + uint32x4_t cmp = + vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); + uint32x4_t swapped = vrev64q_u32(cmp); + return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped))); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-equal, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_sd +FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpneq_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-greater-than-or-equal, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_pd +FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b) +{ + return _mm_cmplt_pd(a, b); +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-greater-than-or-equal, store the result in the lower element of +// dst, and copy the upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_sd +FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b) +{ + return _mm_cmplt_sd(a, b); +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for equality, and return the boolean result (0 or 1). +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_sd +FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return !!vgetq_lane_u64(vceqq_f64(a, b), 0); +#else + uint32x4_t a_not_nan = + vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a)); + uint32x4_t b_not_nan = + vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b)); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_eq_b = + vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); + uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan), + vreinterpretq_u64_u32(a_eq_b)); + return !!vgetq_lane_u64(and_results, 0); +#endif +} + +// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers +// in b for greater than. +// +// r0 := (a0 > b0) ? 0xff : 0x0 +// r1 := (a1 > b1) ? 0xff : 0x0 +// ... +// r15 := (a15 > b15) ? 0xff : 0x0 +// +// https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for greater-than, and store the results in dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_pd
+FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(
+        vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for greater-than, store the result in the lower element of dst, and copy
+// the upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_sd
+FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
+#else
+    // expand "_mm_cmpgt_pd()" to reduce unnecessary operations
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for less-than-or-equal, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_pd
+FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(
+        vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for less-than-or-equal, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_sd
+FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_cmple_pd(a, b));
+#else
+    // expand "_mm_cmple_pd()" to reduce unnecessary operations
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) <= (*(double *) &b0) ?
~UINT64_C(0) : UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers +// in b for less than. +// +// r0 := (a0 < b0) ? 0xffff : 0x0 +// r1 := (a1 < b1) ? 0xffff : 0x0 +// ... +// r7 := (a7 < b7) ? 0xffff : 0x0 +// +// https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers +// in b for greater than. +// +// r0 := (a0 > b0) ? 0xffff : 0x0 +// r1 := (a1 > b1) ? 0xffff : 0x0 +// ... +// r7 := (a7 > b7) ? 0xffff : 0x0 +// +// https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + + +// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers +// in b for less than. +// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers +// in b for greater than. +// https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers +// in b for greater than. +FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_u64( + vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +#else + return vreinterpretq_m128i_s64(vshrq_n_s64( + vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)), + 63)); +#endif +} + +// Compares the four 32-bit floats in a and b to check if any values are NaN. +// Ordered compare between each value returns true for "orderable" and false for +// "not orderable" (NaN). +// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see +// also: +// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean +// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics +FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b) +{ + // Note: NEON does not have ordered compare builtin + // Need to compare a eq a and b eq b to check for NaN + // Do AND of results to get final + uint32x4_t ceqaa = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t ceqbb = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb)); +} + +// Compares for ordered. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100) +FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpord_ps(a, b)); +} + +// Compares for unordered. 
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
+{
+    uint32x4_t a_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t b_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)));
+}
+
+// Compares for unordered.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using a less than operation.
+// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important
+// note!! The documentation on MSDN is incorrect! If either of the values is a
+// NAN the docs say you will get a one, but in fact, it will return a zero!!
+FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
+{
+    uint32x4_t a_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t b_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+    uint32x4_t a_lt_b =
+        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_lt_b), 0) != 0) ? 1 : 0;
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using a greater than operation.
+// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
+FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
+{
+    // return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a),
+    // vreinterpretq_f32_m128(b)), 0);
+    uint32x4_t a_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t b_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+    uint32x4_t a_gt_b =
+        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1 : 0;
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using a less than or equal operation.
+// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
+FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
+{
+    // return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a),
+    // vreinterpretq_f32_m128(b)), 0);
+    uint32x4_t a_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t b_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+    uint32x4_t a_le_b =
+        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_le_b), 0) != 0) ? 1 : 0;
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using a greater than or equal operation.
+// https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
+FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
+{
+    // return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a),
+    // vreinterpretq_f32_m128(b)), 0);
+    uint32x4_t a_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t b_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+    uint32x4_t a_ge_b =
+        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1 : 0;
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using an equality operation.
+// https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
+FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
+{
+    // return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
+    // vreinterpretq_f32_m128(b)), 0);
+    uint32x4_t a_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t b_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+    uint32x4_t a_eq_b =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_eq_b), 0) != 0) ? 1 : 0;
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using an inequality operation.
+// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
+FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
+{
+    // return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
+    // vreinterpretq_f32_m128(b)), 0);
+    uint32x4_t a_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t b_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
+    uint32x4_t a_neq_b = vmvnq_u32(
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+    return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_neq_b), 0) != 0) ? 1 : 0;
+}
+
+// According to the documentation, these intrinsics behave the same as the
+// non-'u' versions. We'll just alias them here.
+#define _mm_ucomieq_ss _mm_comieq_ss
+#define _mm_ucomige_ss _mm_comige_ss
+#define _mm_ucomigt_ss _mm_comigt_ss
+#define _mm_ucomile_ss _mm_comile_ss
+#define _mm_ucomilt_ss _mm_comilt_ss
+#define _mm_ucomineq_ss _mm_comineq_ss
+
+/* Conversions */
+
+// Convert packed signed 32-bit integers in b to packed single-precision
+// (32-bit) floating-point elements, store the results in the lower 2 elements
+// of dst, and copy the upper 2 packed elements from a to the upper elements of
+// dst.
+//
+// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+// dst[63:32] := Convert_Int32_To_FP32(b[63:32])
+// dst[95:64] := a[95:64]
+// dst[127:96] := a[127:96]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps
+FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
+{
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
+                     vget_high_f32(vreinterpretq_f32_m128(a))));
+}
+
+// Convert the signed 32-bit integer b to a single-precision (32-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+//
+// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+// dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss
+FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
+{
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
+}
+
+// Convert the signed 32-bit integer b to a single-precision (32-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+//
+// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+// dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss
+#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
+
+// Convert the signed 64-bit integer b to a single-precision (32-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+//
+// dst[31:0] := Convert_Int64_To_FP32(b[63:0])
+// dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss
+FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
+{
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
+}
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer, and store the result in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si
+FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
+{
+#if defined(__aarch64__)
+    return vgetq_lane_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a)), 0);
+#else
+    float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    float32_t diff = data - floor(data);
+    if (diff > 0.5)
+        return (int32_t) ceil(data);
+    if (unlikely(diff == 0.5)) {
+        int32_t f = (int32_t) floor(data);
+        int32_t c = (int32_t) ceil(data);
+        return c & 1 ? f : c;
+    }
+    return (int32_t) floor(data);
+#endif
+}
+
+// Convert packed 16-bit integers in a to packed single-precision (32-bit)
+// floating-point elements, and store the results in dst.
+//
+// FOR j := 0 to 3
+//   i := j*16
+//   m := j*32
+//   dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps
+FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
+{
+    return vreinterpretq_m128_f32(
+        vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
+}
+
+// Convert packed 32-bit integers in b to packed single-precision (32-bit)
+// floating-point elements, store the results in the lower 2 elements of dst,
+// and copy the upper 2 packed elements from a to the upper elements of dst.
+//
+// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+// dst[63:32] := Convert_Int32_To_FP32(b[63:32])
+// dst[95:64] := a[95:64]
+// dst[127:96] := a[127:96]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps
+FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
+{
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
+                     vget_high_f32(vreinterpretq_f32_m128(a))));
+}
+
+// Convert packed signed 32-bit integers in a to packed single-precision
+// (32-bit) floating-point elements, store the results in the lower 2 elements
+// of dst, then convert the packed signed 32-bit integers in b to
+// single-precision (32-bit) floating-point elements, and store the results in
+// the upper 2 elements of dst.
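+// As an illustrative sketch (example values assumed):
+//   _mm_cvtpi32x2_ps(_mm_set_pi32(2, 1), _mm_set_pi32(4, 3))
+// yields {1.0f, 2.0f, 3.0f, 4.0f} across lanes 0..3.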
+// +// dst[31:0] := Convert_Int32_To_FP32(a[31:0]) +// dst[63:32] := Convert_Int32_To_FP32(a[63:32]) +// dst[95:64] := Convert_Int32_To_FP32(b[31:0]) +// dst[127:96] := Convert_Int32_To_FP32(b[63:32]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps +FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b) +{ + return vreinterpretq_m128_f32(vcvtq_f32_s32( + vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)))); +} + +// Convert the lower packed 8-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// +// FOR j := 0 to 3 +// i := j*8 +// m := j*32 +// dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps +FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a) +{ + return vreinterpretq_m128_f32(vcvtq_f32_s32( + vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a)))))); +} + +// Convert packed unsigned 16-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// m := j*32 +// dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps +FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a) +{ + return vreinterpretq_m128_f32( + vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a)))); +} + +// Convert the lower packed unsigned 8-bit integers in a to packed +// single-precision (32-bit) floating-point elements, and store the results in +// dst. +// +// FOR j := 0 to 3 +// i := j*8 +// m := j*32 +// dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps +FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a) +{ + return vreinterpretq_m128_f32(vcvtq_f32_u32( + vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a)))))); +} + +// Converts the four single-precision, floating-point values of a to signed +// 32-bit integer values using truncate. +// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) +{ + return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))); +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// +// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64 +FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a) +{ +#if defined(__aarch64__) + return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0); +#else + double ret = *((double *) &a); + return (int64_t) ret; +#endif +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. 
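+// For example (values assumed): _mm_cvttsd_si64(_mm_set_sd(-2.7)) returns -2,
+// since truncation rounds toward zero rather than to nearest.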
+//
+// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x
+#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
+
+// Converts the four signed 32-bit integer values of a to single-precision,
+// floating-point values.
+// https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
+{
+    return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
+}
+
+// Convert packed signed 32-bit integers in a to packed double-precision
+// (64-bit) floating-point elements, and store the results in dst.
+//
+// FOR j := 0 to 1
+//   i := j*32
+//   m := j*64
+//   dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_pd
+FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));
+#else
+    double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
+    double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1);
+    return _mm_set_pd(a1, a0);
+#endif
+}
+
+// Convert packed signed 32-bit integers in a to packed double-precision
+// (64-bit) floating-point elements, and store the results in dst.
+//
+// FOR j := 0 to 1
+//   i := j*32
+//   m := j*64
+//   dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_pd
+FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));
+#else
+    double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0);
+    double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1);
+    return _mm_set_pd(a1, a0);
+#endif
+}
+
+// Converts the eight unsigned 8-bit integers in the lower 64 bits to eight
+// unsigned 16-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
+{
+    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);    /* xxxx xxxx xxxx DCBA */
+    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
+    return vreinterpretq_m128i_u16(u16x8);
+}
+
+// Converts the four unsigned 8-bit integers in the lower 32 bits to four
+// unsigned 32-bit integers.
+// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
+FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
+{
+    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx DCBA */
+    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0D0C 0B0A */
+    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
+    return vreinterpretq_m128i_u32(u32x4);
+}
+
+// Converts the two unsigned 8-bit integers in the lower 16 bits to two
+// unsigned 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
+{
+    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx xxBA */
+    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0x0x 0B0A */
+    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
+    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_u64(u64x2);
+}
+
+// Converts the eight signed 8-bit integers in the lower 64 bits to eight
+// signed 16-bit integers.
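+// Unlike _mm_cvtepu8_epi16 above, this sign-extends: with a low byte of 0xFF
+// (an assumed example value), _mm_cvtepi8_epi16 produces -1 in lane 0 while
+// _mm_cvtepu8_epi16 produces 255.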
+FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
+{
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);    /* xxxx xxxx xxxx DCBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
+    return vreinterpretq_m128i_s16(s16x8);
+}
+
+// Converts the four signed 8-bit integers in the lower 32 bits to four
+// signed 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
+{
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx DCBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0D0C 0B0A */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
+    return vreinterpretq_m128i_s32(s32x4);
+}
+
+// Converts the two signed 8-bit integers in the lower 16 bits to two
+// signed 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
+{
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx xxBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0x0x 0B0A */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
+    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_s64(s64x2);
+}
+
+// Converts the four signed 16-bit integers in the lower 64 bits to four signed
+// 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
+{
+    return vreinterpretq_m128i_s32(
+        vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
+}
+
+// Converts the two signed 16-bit integers in the lower 32 bits to two signed
+// 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
+{
+    int16x8_t s16x8 = vreinterpretq_s16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
+    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_s64(s64x2);
+}
+
+// Converts the four unsigned 16-bit integers in the lower 64 bits to four
+// unsigned 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
+{
+    return vreinterpretq_m128i_u32(
+        vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
+}
+
+// Converts the two unsigned 16-bit integers in the lower 32 bits to two
+// unsigned 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
+{
+    uint16x8_t u16x8 = vreinterpretq_u16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
+    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
+    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_u64(u64x2);
+}
+
+// Converts the two unsigned 32-bit integers in the lower 64 bits to two
+// unsigned 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
+{
+    return vreinterpretq_m128i_u64(
+        vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
+}
+
+// Converts the two signed 32-bit integers in the lower 64 bits to two signed
+// 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
+{
+    return vreinterpretq_m128i_s64(
+        vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
+}
+
+// Converts the four single-precision, floating-point values of a to signed
+// 32-bit integer values.
+//
+// r0 := (int) a0
+// r1 := (int) a1
+// r2 := (int) a2
+// r3 := (int) a3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
+// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
+// does not support! It is supported on ARMv8-A however.
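+// A sketch of the round-to-even behaviour (example values assumed):
+// _mm_cvtps_epi32(_mm_set_ps(2.5f, 1.5f, 0.5f, -0.5f)) yields {0, 0, 2, 2}
+// across lanes 0..3, with every half-way case rounded to the nearest even
+// integer.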
+FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a)); +#else + uint32x4_t signmask = vdupq_n_u32(0x80000000); + float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), + vdupq_n_f32(0.5f)); /* +/- 0.5 */ + int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32( + vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/ + int32x4_t r_trunc = + vcvtq_s32_f32(vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */ + int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32( + vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ + int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), + vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ + float32x4_t delta = vsubq_f32( + vreinterpretq_f32_m128(a), + vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ + uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */ + return vreinterpretq_m128i_s32(vbslq_s32(is_delta_half, r_even, r_normal)); +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 16-bit integers, and store the results in dst. Note: this intrinsic +// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and +// 0x7FFFFFFF. +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi16 +FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a) +{ + return vreinterpret_m64_s16( + vmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a)))); +} + +// Copy the lower 32-bit integer in a to dst. +// +// dst[31:0] := a[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32 +FORCE_INLINE int _mm_cvtsi128_si32(__m128i a) +{ + return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); +} + +// Copy the lower 64-bit integer in a to dst. +// +// dst[63:0] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64 +FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a) +{ + return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0); +} + +// Copy the lower 64-bit integer in a to dst. +// +// dst[63:0] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x +#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) + +// Moves 32-bit integer a to the least significant 32 bits of an __m128 object, +// zero extending the upper bits. +// +// r0 := a +// r1 := 0x0 +// r2 := 0x0 +// r3 := 0x0 +// +// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_cvtsi32_si128(int a) +{ + return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0)); +} + +// Moves 64-bit integer a to the least significant 64 bits of an __m128 object, +// zero extending the upper bits. +// +// r0 := a +// r1 := 0x0 +FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a) +{ + return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0)); +} + +// Cast vector of type __m128 to type __m128d. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd +FORCE_INLINE __m128d _mm_castps_pd(__m128 a) +{ + return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a)); +} + +// Applies a type cast to reinterpret four 32-bit floating point values passed +// in as a 128-bit parameter as packed 32-bit integers. 
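+// The cast is purely a bit-pattern reinterpretation; as an assumed example,
+// _mm_castps_si128(_mm_set1_ps(1.0f)) reads back 0x3F800000 in each 32-bit
+// lane.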
+// https://msdn.microsoft.com/en-us/library/bb514099.aspx
+FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
+{
+    return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
+}
+
+// Cast vector of type __m128i to type __m128d. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_pd
+FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
+#else
+    return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
+#endif
+}
+
+// Applies a type cast to reinterpret four 32-bit integers passed in as a
+// 128-bit parameter as packed 32-bit floating point values.
+// https://msdn.microsoft.com/en-us/library/bb514029.aspx
+FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
+{
+    return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
+}
+
+// Loads 128-bit value. :
+// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
+FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
+{
+    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into
+// both elements of dst.
+//
+// dst[63:0] := MEM[mem_addr+63:mem_addr]
+// dst[127:64] := MEM[mem_addr+63:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd
+FORCE_INLINE __m128d _mm_load1_pd(const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
+#else
+    return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
+#endif
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into
+// both elements of dst.
+//
+// dst[63:0] := MEM[mem_addr+63:mem_addr]
+// dst[127:64] := MEM[mem_addr+63:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1
+#define _mm_load_pd1 _mm_load1_pd
+
+// Load a double-precision (64-bit) floating-point element from memory into
+// both elements of dst.
+//
+// dst[63:0] := MEM[mem_addr+63:mem_addr]
+// dst[127:64] := MEM[mem_addr+63:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd
+#define _mm_loaddup_pd _mm_load1_pd
+
+// Load a double-precision (64-bit) floating-point element from memory into the
+// upper element of dst, and copy the lower element from a to dst. mem_addr
+// does not need to be aligned on any particular boundary.
+//
+// dst[63:0] := a[63:0]
+// dst[127:64] := MEM[mem_addr+63:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd
+FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
+#else
+    return vreinterpretq_m128d_f32(
+        vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(a)),
+                     vld1_f32((const float *) p)));
+#endif
+}
+
+// Loads 128-bit value. :
+// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
+{
+    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
+}
+
+// Load unaligned 32-bit integer from memory into the first element of dst.
+//
+// dst[31:0] := MEM[mem_addr+31:mem_addr]
+// dst[MAX:32] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32
+FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
+{
+    return vreinterpretq_m128i_s32(
+        vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
+}
+
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed single-precision (32-bit) floating-point elements, and store the
+// results in dst.
+//
+// FOR j := 0 to 1
+//   i := 32*j
+//   k := 64*j
+//   dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k])
+// ENDFOR
+// dst[127:64] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps
+FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
+{
+#if defined(__aarch64__)
+    float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
+    return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
+#else
+    float a0 = (float) ((double *) &a)[0];
+    float a1 = (float) ((double *) &a)[1];
+    return _mm_set_ps(0, 0, a1, a0);
+#endif
+}
+
+// Copy the lower double-precision (64-bit) floating-point element of a to dst.
+//
+// dst[63:0] := a[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64
+FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
+{
+#if defined(__aarch64__)
+    return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
+#else
+    return ((double *) &a)[0];
+#endif
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed double-precision (64-bit) floating-point elements, and store the
+// results in dst.
+//
+// FOR j := 0 to 1
+//   i := 64*j
+//   k := 32*j
+//   dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd
+FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
+#else
+    double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
+    return _mm_set_pd(a1, a0);
+#endif
+}
+
+// Cast vector of type __m128d to type __m128i. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128
+FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
+{
+    return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
+}
+
+// Cast vector of type __m128d to type __m128. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps
+FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
+{
+    return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
+}
+
+// Blend packed single-precision (32-bit) floating-point elements from a and b
+// using mask, and store the results in dst.
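+// Only the sign bit of each mask lane matters; as an assumed example, with
+// a = {1, 2, 3, 4}, b = {5, 6, 7, 8} and mask = {-0.0f, 0.0f, -0.0f, 0.0f},
+// _mm_blendv_ps(a, b, mask) yields {5, 2, 7, 4}.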
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps +FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask) +{ + // Use a signed shift right to create a mask with the sign bit + uint32x4_t mask = + vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31)); + float32x4_t a = vreinterpretq_f32_m128(_a); + float32x4_t b = vreinterpretq_f32_m128(_b); + return vreinterpretq_m128_f32(vbslq_f32(mask, b, a)); +} + +// Blend packed single-precision (32-bit) floating-point elements from a and b +// using mask, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps +FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8) +{ + const uint32_t ALIGN_STRUCT(16) + data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0, + ((imm8) & (1 << 1)) ? UINT32_MAX : 0, + ((imm8) & (1 << 2)) ? UINT32_MAX : 0, + ((imm8) & (1 << 3)) ? UINT32_MAX : 0}; + uint32x4_t mask = vld1q_u32(data); + float32x4_t a = vreinterpretq_f32_m128(_a); + float32x4_t b = vreinterpretq_f32_m128(_b); + return vreinterpretq_m128_f32(vbslq_f32(mask, b, a)); +} + +// Blend packed double-precision (64-bit) floating-point elements from a and b +// using mask, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd +FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask) +{ + uint64x2_t mask = + vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63)); +#if defined(__aarch64__) + float64x2_t a = vreinterpretq_f64_m128d(_a); + float64x2_t b = vreinterpretq_f64_m128d(_b); + return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a)); +#else + uint64x2_t a = vreinterpretq_u64_m128d(_a); + uint64x2_t b = vreinterpretq_u64_m128d(_b); + return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a)); +#endif +} + +typedef struct { + uint16_t res0; + uint8_t res1 : 6; + uint8_t bit22 : 1; + uint8_t bit23 : 1; + uint8_t res2; +#if defined(__aarch64__) + uint32_t res3; +#endif +} fpcr_bitfield; + +// Macro: Set the rounding mode bits of the MXCSR control and status register to +// the value in unsigned 32-bit integer a. The rounding mode may contain any of +// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, +// _MM_ROUND_TOWARD_ZERO +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE +FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding) +{ + union { + fpcr_bitfield field; +#if defined(__aarch64__) + uint64_t value; +#else + uint32_t value; +#endif + } r; + +#if defined(__aarch64__) + asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */ +#else + asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ +#endif + + switch (rounding) { + case _MM_ROUND_TOWARD_ZERO: + r.field.bit22 = 1; + r.field.bit23 = 1; + break; + case _MM_ROUND_DOWN: + r.field.bit22 = 0; + r.field.bit23 = 1; + break; + case _MM_ROUND_UP: + r.field.bit22 = 1; + r.field.bit23 = 0; + break; + default: //_MM_ROUND_NEAREST + r.field.bit22 = 0; + r.field.bit23 = 0; + } + +#if defined(__aarch64__) + asm volatile("msr FPCR, %0" ::"r"(r)); /* write */ +#else + asm volatile("vmsr FPSCR, %0" ::"r"(r)); /* write */ +#endif +} + +FORCE_INLINE void _mm_setcsr(unsigned int a) +{ + _MM_SET_ROUNDING_MODE(a); +} + +// Round the packed single-precision (32-bit) floating-point elements in a using +// the rounding parameter, and store the results as packed single-precision +// floating-point elements in dst. 
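+// As an assumed example, _mm_round_ps(_mm_set1_ps(2.5f),
+// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) yields 2.0f in every lane
+// (ties go to even), whereas _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC yields
+// 3.0f.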
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
+FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
+{
+#if defined(__aarch64__)
+    switch (rounding) {
+    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
+        return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
+    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
+        return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
+    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
+        return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
+    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
+        return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
+    default:  //_MM_FROUND_CUR_DIRECTION
+        return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
+    }
+#else
+    float *v_float = (float *) &a;
+    __m128 zero, neg_inf, pos_inf;
+
+    switch (rounding) {
+    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
+        return _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
+    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
+        return (__m128){floorf(v_float[0]), floorf(v_float[1]),
+                        floorf(v_float[2]), floorf(v_float[3])};
+    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
+        return (__m128){ceilf(v_float[0]), ceilf(v_float[1]),
+                        ceilf(v_float[2]), ceilf(v_float[3])};
+    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
+        // Truncation rounds positive values down (floor) and non-positive
+        // values up (ceil). Note that _mm_set_ps takes its arguments from
+        // the highest lane to the lowest.
+        zero = _mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f);
+        neg_inf = _mm_set_ps(floorf(v_float[3]), floorf(v_float[2]),
+                             floorf(v_float[1]), floorf(v_float[0]));
+        pos_inf = _mm_set_ps(ceilf(v_float[3]), ceilf(v_float[2]),
+                             ceilf(v_float[1]), ceilf(v_float[0]));
+        return _mm_blendv_ps(neg_inf, pos_inf, _mm_cmple_ps(a, zero));
+    default:  //_MM_FROUND_CUR_DIRECTION
+        return (__m128){roundf(v_float[0]), roundf(v_float[1]),
+                        roundf(v_float[2]), roundf(v_float[3])};
+    }
+#endif
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers, and store the results in dst.
+//
+// FOR j := 0 to 1
+//   i := 32*j
+//   dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi
+FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
+{
+#if defined(__aarch64__)
+    return vreinterpret_m64_s32(
+        vget_low_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a))));
+#else
+    return vreinterpret_m64_s32(
+        vcvt_s32_f32(vget_low_f32(vreinterpretq_f32_m128(
+            _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)))));
+#endif
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers, and store the results in dst.
+//
+// FOR j := 0 to 1
+//   i := 32*j
+//   dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi32
+#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
+
+// Round the packed single-precision (32-bit) floating-point elements in a up
+// to an integer value, and store the results as packed single-precision
+// floating-point elements in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps
+FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
+{
+    return _mm_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+}
+
+// Round the lower single-precision (32-bit) floating-point element in b up to
+// an integer value, store the result as a single-precision floating-point
+// element in the lower element of dst, and copy the upper 3 packed elements
+// from a to the upper elements of dst.
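+// For instance (values assumed): with a = {1.1f, 2.2f, 3.3f, 4.4f} and b
+// holding 0.3f in its lower lane, _mm_ceil_ss(a, b) yields
+// {1.0f, 2.2f, 3.3f, 4.4f}.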
+//
+// dst[31:0] := CEIL(b[31:0])
+// dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss
+FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(
+        a, _mm_round_ps(b, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC));
+}
+
+// Round the packed single-precision (32-bit) floating-point elements in a down
+// to an integer value, and store the results as packed single-precision
+// floating-point elements in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps
+FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
+{
+    return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+}
+
+// Round the lower single-precision (32-bit) floating-point element in b down
+// to an integer value, store the result as a single-precision floating-point
+// element in the lower element of dst, and copy the upper 3 packed elements
+// from a to the upper elements of dst.
+//
+// dst[31:0] := FLOOR(b[31:0])
+// dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss
+FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(
+        a, _mm_round_ps(b, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
+}
+
+// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
+// may perform better than _mm_loadu_si128 when the data crosses a cache line
+// boundary.
+//
+// dst[127:0] := MEM[mem_addr+127:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128
+#define _mm_lddqu_si128 _mm_loadu_si128
+
+/* Miscellaneous Operations */
+
+// Shifts the 8 signed 16-bit integers in a right by count bits while shifting
+// in the sign bit.
+//
+// r0 := a0 >> count
+// r1 := a1 >> count
+// ...
+// r7 := a7 >> count
+//
+// https://msdn.microsoft.com/en-us/library/3c9997dk(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
+{
+    int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
+    if (unlikely(c > 15))
+        return _mm_cmplt_epi16(a, _mm_setzero_si128());
+    return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
+}
+
+// Shifts the 4 signed 32-bit integers in a right by count bits while shifting
+// in the sign bit.
+//
+// r0 := a0 >> count
+// r1 := a1 >> count
+// r2 := a2 >> count
+// r3 := a3 >> count
+//
+// https://msdn.microsoft.com/en-us/library/ce40009e(v%3dvs.100).aspx
+FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
+{
+    int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
+    if (unlikely(c > 31))
+        return _mm_cmplt_epi32(a, _mm_setzero_si128());
+    return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
+}
+
+// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and
+// saturates.
+// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
+                    vqmovn_s16(vreinterpretq_s16_m128i(b))));
+}
+
+// Packs the 16 signed 16-bit integers from a and b into 8-bit unsigned
+// integers and saturates.
+//
+// r0 := UnsignedSaturate(a0)
+// r1 := UnsignedSaturate(a1)
+// ...
+// r7 := UnsignedSaturate(a7)
+// r8 := UnsignedSaturate(b0)
+// r9 := UnsignedSaturate(b1)
+// ...
+// r15 := UnsignedSaturate(b7)
+//
+// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
+                    vqmovun_s16(vreinterpretq_s16_m128i(b))));
+}
+
+// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers
+// and saturates.
+//
+// r0 := SignedSaturate(a0)
+// r1 := SignedSaturate(a1)
+// r2 := SignedSaturate(a2)
+// r3 := SignedSaturate(a3)
+// r4 := SignedSaturate(b0)
+// r5 := SignedSaturate(b1)
+// r6 := SignedSaturate(b2)
+// r7 := SignedSaturate(b3)
+//
+// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
+                     vqmovn_s32(vreinterpretq_s32_m128i(b))));
+}
+
+// Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit
+// integers and saturates.
+//
+// r0 := UnsignedSaturate(a0)
+// r1 := UnsignedSaturate(a1)
+// r2 := UnsignedSaturate(a2)
+// r3 := UnsignedSaturate(a3)
+// r4 := UnsignedSaturate(b0)
+// r5 := UnsignedSaturate(b1)
+// r6 := UnsignedSaturate(b2)
+// r7 := UnsignedSaturate(b3)
+FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
+                     vqmovun_s32(vreinterpretq_s32_m128i(b))));
+}
+
+// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the
+// lower 8 signed or unsigned 8-bit integers in b.
+//
+// r0 := a0
+// r1 := b0
+// r2 := a1
+// r3 := b1
+// ...
+// r14 := a7
+// r15 := b7
+//
+// https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s8(
+        vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+#else
+    int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
+    int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
+    int8x8x2_t result = vzip_s8(a1, b1);
+    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
+#endif
+}
+
+// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the
+// lower 4 signed or unsigned 16-bit integers in b.
+//
+// r0 := a0
+// r1 := b0
+// r2 := a1
+// r3 := b1
+// r4 := a2
+// r5 := b2
+// r6 := a3
+// r7 := b3
+//
+// https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(
+        vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+#else
+    int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
+    int16x4x2_t result = vzip_s16(a1, b1);
+    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
+#endif
+}
+
+// Interleaves the lower 2 signed or unsigned 32-bit integers in a with the
+// lower 2 signed or unsigned 32-bit integers in b.
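+// As an assumed example, with a = {1, 2, 3, 4} and b = {5, 6, 7, 8},
+// _mm_unpacklo_epi32(a, b) yields {1, 5, 2, 6}.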
+// +// r0 := a0 +// r1 := b0 +// r2 := a1 +// r3 := b1 +// +// https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s32( + vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +#else + int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a)); + int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b)); + int32x2x2_t result = vzip_s32(a1, b1); + return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); +#endif +} + +FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) +{ + int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a)); + int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b)); + return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l)); +} + +// Selects and interleaves the lower two single-precision, floating-point values +// from a and b. +// +// r0 := a0 +// r1 := b0 +// r2 := a1 +// r3 := b1 +// +// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128_f32( + vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#else + float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b)); + float32x2x2_t result = vzip_f32(a1, b1); + return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); +#endif +} + +// Unpack and interleave double-precision (64-bit) floating-point elements from +// the low half of a and b, and store the results in dst. +// +// DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { +// dst[63:0] := src1[63:0] +// dst[127:64] := src2[63:0] +// RETURN dst[127:0] +// } +// dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_pd +FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + return vreinterpretq_m128d_s64( + vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)), + vget_low_s64(vreinterpretq_s64_m128d(b)))); +#endif +} + +// Unpack and interleave double-precision (64-bit) floating-point elements from +// the high half of a and b, and store the results in dst. +// +// DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { +// dst[63:0] := src1[127:64] +// dst[127:64] := src2[127:64] +// RETURN dst[127:0] +// } +// dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd +FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + return vreinterpretq_m128d_s64( + vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)), + vget_high_s64(vreinterpretq_s64_m128d(b)))); +#endif +} + +// Selects and interleaves the upper two single-precision, floating-point values +// from a and b. 
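+// As an assumed example, with a = {1, 2, 3, 4} and b = {5, 6, 7, 8},
+// _mm_unpackhi_ps(a, b) yields {3, 7, 4, 8}.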
+// +// r0 := a2 +// r1 := b2 +// r2 := a3 +// r3 := b3 +// +// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128_f32( + vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#else + float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b)); + float32x2x2_t result = vzip_f32(a1, b1); + return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); +#endif +} + +// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper +// 8 signed or unsigned 8-bit integers in b. +// +// r0 := a8 +// r1 := b8 +// r2 := a9 +// r3 := b9 +// ... +// r14 := a15 +// r15 := b15 +// +// https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s8( + vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +#else + int8x8_t a1 = + vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a))); + int8x8_t b1 = + vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b))); + int8x8x2_t result = vzip_s8(a1, b1); + return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); +#endif +} + +// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the +// upper 4 signed or unsigned 16-bit integers in b. +// +// r0 := a4 +// r1 := b4 +// r2 := a5 +// r3 := b5 +// r4 := a6 +// r5 := b6 +// r6 := a7 +// r7 := b7 +// +// https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s16( + vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +#else + int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b)); + int16x4x2_t result = vzip_s16(a1, b1); + return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); +#endif +} + +// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the +// upper 2 signed or unsigned 32-bit integers in b. +// https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s32( + vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +#else + int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b)); + int32x2x2_t result = vzip_s32(a1, b1); + return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); +#endif +} + +// Interleaves the upper signed or unsigned 64-bit integer in a with the +// upper signed or unsigned 64-bit integer in b. +// +// r0 := a1 +// r1 := b1 +FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) +{ + int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a)); + int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b)); + return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h)); +} + +// Horizontally compute the minimum amongst the packed unsigned 16-bit integers +// in a, store the minimum and index in dst, and zero the remaining bits in dst. 
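+// As an assumed example, for the eight 16-bit lanes {7, 2, 9, 2, 8, 6, 5, 3},
+// _mm_minpos_epu16 yields 2 in lane 0 (the minimum) and 1 in lane 1 (the
+// index of its first occurrence), with all remaining lanes zeroed.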
+// +// index[2:0] := 0 +// min[15:0] := a[15:0] +// FOR j := 0 to 7 +// i := j*16 +// IF a[i+15:i] < min[15:0] +// index[2:0] := j +// min[15:0] := a[i+15:i] +// FI +// ENDFOR +// dst[15:0] := min[15:0] +// dst[18:16] := index[2:0] +// dst[127:19] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16 +FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a) +{ + __m128i dst; + uint16_t min, idx = 0; + // Find the minimum value +#if defined(__aarch64__) + min = vminvq_u16(vreinterpretq_u16_m128i(a)); +#else + __m64 tmp; + tmp = vreinterpret_m64_u16( + vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)), + vget_high_u16(vreinterpretq_u16_m128i(a)))); + tmp = vreinterpret_m64_u16( + vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp))); + tmp = vreinterpret_m64_u16( + vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp))); + min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0); +#endif + // Get the index of the minimum value + int i; + for (i = 0; i < 8; i++) { + if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) { + idx = (uint16_t) i; + break; + } + a = _mm_srli_si128(a, 2); + } + // Generate result + dst = _mm_setzero_si128(); + dst = vreinterpretq_m128i_u16( + vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0)); + dst = vreinterpretq_m128i_u16( + vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1)); + return dst; +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return the CF value. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128 +FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b) +{ + int64x2_t s64 = + vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))), + vreinterpretq_s64_m128i(b)); + return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return the ZF value. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128 +FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b) +{ + int64x2_t s64 = + vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)); + return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); +} + +// Extracts the selected signed or unsigned 8-bit integer from a and zero +// extends. +// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm) +#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm)) + +// Inserts the least significant 8 bits of b into the selected 8-bit integer +// of a. +// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b, +// __constrange(0,16) int imm) +#define _mm_insert_epi8(a, b, imm) \ + __extension__({ \ + vreinterpretq_m128i_s8( \ + vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \ + }) + +// Extracts the selected signed or unsigned 16-bit integer from a and zero +// extends. 
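+// As an assumed example, if lane 3 of a holds 0xFFFF, _mm_extract_epi16(a, 3)
+// returns 65535 rather than -1, since the result is zero-extended.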
+// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx +// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm) +#define _mm_extract_epi16(a, imm) \ + vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm)) + +// Extract a 16-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_pi16 +#define _mm_extract_pi16(a, imm) \ + (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm)) + +// Inserts the least significant 16 bits of b into the selected 16-bit integer +// of a. +// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx +// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b, +// __constrange(0,8) int imm) +#define _mm_insert_epi16(a, b, imm) \ + __extension__({ \ + vreinterpretq_m128i_s16( \ + vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \ + }) + +// Copy a to dst, and insert the 16-bit integer i into dst at the location +// specified by imm8. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16 +#define _mm_insert_pi16(a, b, imm) \ + __extension__({ \ + vreinterpret_m64_s16( \ + vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \ + }) + +// Extracts the selected signed or unsigned 32-bit integer from a and zero +// extends. +// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm) +#define _mm_extract_epi32(a, imm) \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)) + +// Extracts the selected single-precision (32-bit) floating-point from a. +// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm) +#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm)) + +// Inserts the least significant 32 bits of b into the selected 32-bit integer +// of a. +// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b, +// __constrange(0,4) int imm) +#define _mm_insert_epi32(a, b, imm) \ + __extension__({ \ + vreinterpretq_m128i_s32( \ + vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \ + }) + +// Extracts the selected signed or unsigned 64-bit integer from a and zero +// extends. +// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm) +#define _mm_extract_epi64(a, imm) \ + vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm)) + +// Inserts the least significant 64 bits of b into the selected 64-bit integer +// of a. +// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b, +// __constrange(0,2) int imm) +#define _mm_insert_epi64(a, b, imm) \ + __extension__({ \ + vreinterpretq_m128i_s64( \ + vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \ + }) + +// Count the number of bits set to 1 in unsigned 32-bit integer a, and +// return that count in dst. 
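+// As an assumed example, _mm_popcnt_u32(0xF0F0F0F0) returns 16.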
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32 +FORCE_INLINE int _mm_popcnt_u32(unsigned int a) +{ +#if defined(__aarch64__) +#if __has_builtin(__builtin_popcount) + return __builtin_popcount(a); +#else + return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a))); +#endif +#else + uint32_t count = 0; + uint8x8_t input_val, count8x8_val; + uint16x4_t count16x4_val; + uint32x2_t count32x2_val; + + input_val = vld1_u8((uint8_t *) &a); + count8x8_val = vcnt_u8(input_val); + count16x4_val = vpaddl_u8(count8x8_val); + count32x2_val = vpaddl_u16(count16x4_val); + + vst1_u32(&count, count32x2_val); + return count; +#endif +} + +// Count the number of bits set to 1 in unsigned 64-bit integer a, and +// return that count in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64 +FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) +{ +#if defined(__aarch64__) +#if __has_builtin(__builtin_popcountll) + return __builtin_popcountll(a); +#else + return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a))); +#endif +#else + uint64_t count = 0; + uint8x8_t input_val, count8x8_val; + uint16x4_t count16x4_val; + uint32x2_t count32x2_val; + uint64x1_t count64x1_val; + + input_val = vld1_u8((uint8_t *) &a); + count8x8_val = vcnt_u8(input_val); + count16x4_val = vpaddl_u8(count8x8_val); + count32x2_val = vpaddl_u16(count16x4_val); + count64x1_val = vpaddl_u32(count32x2_val); + vst1_u64(&count, count64x1_val); + return count; +#endif +} + +// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision +// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the +// transposed matrix in these vectors (row0 now contains column 0, etc.). +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS +#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ + do { \ + float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \ + float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \ + row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \ + vget_low_f32(ROW23.val[0])); \ + row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \ + vget_low_f32(ROW23.val[1])); \ + row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \ + vget_high_f32(ROW23.val[0])); \ + row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \ + vget_high_f32(ROW23.val[1])); \ + } while (0) + +/* Crypto Extensions */ + +#if defined(__ARM_FEATURE_CRYPTO) +// Wraps vmull_p64 +FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) +{ + poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0); + poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0); + return vreinterpretq_u64_p128(vmull_p64(a, b)); +} +#else // ARMv7 polyfill +// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8. +// +// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a +// 64-bit->128-bit polynomial multiply. +// +// It needs some work and is somewhat slow, but it is still faster than all +// known scalar methods. 
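+// In carry-less multiplication the partial products are combined with XOR, so
+// there is no carry propagation; as an assumed example, 0b11 multiplied
+// carry-lessly by 0b11 gives 0b101 (5), not 9.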
+// +// Algorithm adapted to C from +// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted +// from "Fast Software Polynomial Multiplication on ARM Processors Using the +// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab +// (https://hal.inria.fr/hal-01506572) +static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) +{ + poly8x8_t a = vreinterpret_p8_u64(_a); + poly8x8_t b = vreinterpret_p8_u64(_b); + + // Masks + uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff), + vcreate_u8(0x00000000ffffffff)); + uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff), + vcreate_u8(0x0000000000000000)); + + // Do the multiplies, rotating with vext to get all combinations + uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0 + uint8x16_t e = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1 + uint8x16_t f = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0 + uint8x16_t g = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2 + uint8x16_t h = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0 + uint8x16_t i = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3 + uint8x16_t j = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0 + uint8x16_t k = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // L = A0 * B4 + + // Add cross products + uint8x16_t l = veorq_u8(e, f); // L = E + F + uint8x16_t m = veorq_u8(g, h); // M = G + H + uint8x16_t n = veorq_u8(i, j); // N = I + J + + // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL + // instructions. +#if defined(__aarch64__) + uint8x16_t lm_p0 = vreinterpretq_u8_u64( + vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); + uint8x16_t lm_p1 = vreinterpretq_u8_u64( + vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); + uint8x16_t nk_p0 = vreinterpretq_u8_u64( + vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); + uint8x16_t nk_p1 = vreinterpretq_u8_u64( + vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); +#else + uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m)); + uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m)); + uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k)); + uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k)); +#endif + // t0 = (L) (P0 + P1) << 8 + // t1 = (M) (P2 + P3) << 16 + uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1); + uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32); + uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h); + + // t2 = (N) (P4 + P5) << 24 + // t3 = (K) (P6 + P7) << 32 + uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1); + uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00); + uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h); + + // De-interleave +#if defined(__aarch64__) + uint8x16_t t0 = vreinterpretq_u8_u64( + vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); + uint8x16_t t1 = vreinterpretq_u8_u64( + vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); + uint8x16_t t2 = vreinterpretq_u8_u64( + vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); + uint8x16_t t3 = vreinterpretq_u8_u64( + vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); +#else + uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h)); + uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h)); + uint8x16_t t3 = 
vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h)); + uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h)); +#endif + // Shift the cross products + uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8 + uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16 + uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24 + uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32 + + // Accumulate the products + uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift); + uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift); + uint8x16_t mix = veorq_u8(d, cross1); + uint8x16_t r = veorq_u8(mix, cross2); + return vreinterpretq_u64_u8(r); +} +#endif // ARMv7 polyfill + +// Perform a carry-less multiplication of two 64-bit integers, selected from a +// and b according to imm8, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128 +FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm) +{ + uint64x2_t a = vreinterpretq_u64_m128i(_a); + uint64x2_t b = vreinterpretq_u64_m128i(_b); + switch (imm & 0x11) { + case 0x00: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b))); + case 0x01: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b))); + case 0x10: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b))); + case 0x11: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b))); + default: + abort(); + } +} + +#if !defined(__ARM_FEATURE_CRYPTO) +/* clang-format off */ +#define SSE2NEON_AES_DATA(w) \ + { \ + w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \ + w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \ + w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \ + w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \ + w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \ + w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \ + w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \ + w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \ + w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \ + w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \ + w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \ + w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \ + w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \ + w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \ + w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \ + w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \ + w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \ + w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \ + w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \ + w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \ + w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \ + w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \ + w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \ + w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \ + w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \ + w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \ + w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \ + w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \ + w(0x1c), 
w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \ + w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \ + w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \ + w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \ + w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \ + w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \ + w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \ + w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \ + w(0xb0), w(0x54), w(0xbb), w(0x16) \ + } +/* clang-format on */ + +/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */ +#define SSE2NEON_AES_H0(x) (x) +static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0); +#undef SSE2NEON_AES_H0 + +// In the absence of crypto extensions, implement aesenc using regular neon +// intrinsics instead. See: +// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/ +// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and +// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52 +// for more information Reproduced with permission of the author. +FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey) +{ +#if defined(__aarch64__) + static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9, + 0xe, 0x3, 0x8, 0xd, 0x2, 0x7, + 0xc, 0x1, 0x6, 0xb}; + static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4, + 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc}; + + uint8x16_t v; + uint8x16_t w = vreinterpretq_u8_m128i(EncBlock); + + // shift rows + w = vqtbl1q_u8(w, vld1q_u8(shift_rows)); + + // sub bytes + v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_sbox), w); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0); + + // mix columns + w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b); + w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v); + w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); + + // add round key + return vreinterpretq_m128i_u8(w) ^ RoundKey; + +#else /* ARMv7-A NEON implementation */ +#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \ + (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \ + (b0)) +#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */)) +#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x) +#define SSE2NEON_AES_U0(p) \ + SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p)) +#define SSE2NEON_AES_U1(p) \ + SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p) +#define SSE2NEON_AES_U2(p) \ + SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p) +#define SSE2NEON_AES_U3(p) \ + SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p)) + static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = { + SSE2NEON_AES_DATA(SSE2NEON_AES_U0), + SSE2NEON_AES_DATA(SSE2NEON_AES_U1), + SSE2NEON_AES_DATA(SSE2NEON_AES_U2), + SSE2NEON_AES_DATA(SSE2NEON_AES_U3), + }; +#undef SSE2NEON_AES_B2W +#undef SSE2NEON_AES_F2 +#undef SSE2NEON_AES_F3 +#undef SSE2NEON_AES_U0 +#undef SSE2NEON_AES_U1 +#undef SSE2NEON_AES_U2 +#undef SSE2NEON_AES_U3 + + uint32_t x0 = _mm_cvtsi128_si32(EncBlock); + uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55)); + uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA)); + uint32_t x3 = 
_mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF)); + + __m128i out = _mm_set_epi32( + (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^ + aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]), + (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^ + aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]), + (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^ + aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]), + (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^ + aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24])); + + return _mm_xor_si128(out, RoundKey); +#endif +} + +// Perform the last round of an AES encryption flow on data (state) in a using +// the round key in RoundKey, and store the result in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128 +FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) +{ + /* FIXME: optimized for NEON */ + uint8_t v[4][4] = { + [0] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]}, + [1] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]}, + [2] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]}, + [3] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]}, + }; + for (int i = 0; i < 16; i++) + vreinterpretq_nth_u8_m128i(a, i) = + v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i); + return a; +} + +// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist. +// This instruction generates a round key for AES encryption. See +// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/ +// for details. +// +// https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx +FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon) +{ + uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55)); + uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF)); + for (int i = 0; i < 4; ++i) { + ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]]; + ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]]; + } + return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3, + ((X1 >> 8) | (X1 << 24)) ^ rcon, X1); +} +#undef SSE2NEON_AES_DATA + +#else /* __ARM_FEATURE_CRYPTO */ +// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and +// AESMC and then manually applying the real key as an xor operation. This +// unfortunately means an additional xor op; the compiler should be able to +// optimize this away for repeated calls however. See +// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a +// for more details. 
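+//
+// With an all-zero round key the AddRoundKey step inside AESE is a no-op, so
+// the identity being relied on is (sketch):
+//
+//   x86:    aesenc(a, k) == MixColumns(SubBytes(ShiftRows(a))) ^ k
+//   ARMv8:  vaesmcq_u8(vaeseq_u8(a, 0)) ^ k   // same value, one extra eor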
+FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^ + vreinterpretq_u8_m128i(b)); +} + +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128 +FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) +{ + return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8( + vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), + RoundKey); +} + +FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) +{ + // AESE does ShiftRows and SubBytes on A + uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)); + + uint8x16_t dest = { + // Undo ShiftRows step from AESE and extract X1 and X3 + u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1) + u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1)) + u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3) + u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3)) + }; + uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon}; + return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r); +} +#endif + +/* Streaming Extensions */ + +// Guarantees that every preceding store is globally visible before any +// subsequent store. +// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx +FORCE_INLINE void _mm_sfence(void) +{ + __sync_synchronize(); +} + +// Store 64-bits of integer data from a into memory using a non-temporal memory +// hint. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pi +FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a) +{ + vst1_s64((int64_t *) p, vreinterpret_s64_m64(a)); +} + +// Store 128-bits (composed of 4 packed single-precision (32-bit) floating- +// point elements) from a into memory using a non-temporal memory hint. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps +FORCE_INLINE void _mm_stream_ps(float *p, __m128 a) +{ +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, (float32x4_t *) p); +#else + vst1q_f32(p, vreinterpretq_f32_m128(a)); +#endif +} + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory using a non-temporal memory hint. mem_addr must +// be aligned on a 16-byte boundary or a general-protection exception may be +// generated. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pd +FORCE_INLINE void _mm_stream_pd(double *p, __m128d a) +{ +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, (float32x4_t *) p); +#elif defined(__aarch64__) + vst1q_f64(p, vreinterpretq_f64_m128d(a)); +#else + vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a)); +#endif +} + +// Stores the data in a to the address p without polluting the caches. If the +// cache line containing address p is already in the cache, the cache will be +// updated. +// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx +FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a) +{ +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, p); +#else + vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a)); +#endif +} + +// Store 32-bit integer a into memory using a non-temporal hint to minimize +// cache pollution. If the cache line containing address mem_addr is already in +// the cache, the cache will be updated. 
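+// NEON has no scalar non-temporal store, so the version below is an ordinary
+// vst1q_lane_s32. A minimal usage sketch (buffer and value are illustrative):
+//
+//   int dst[4];
+//   for (int i = 0; i < 4; i++)
+//       _mm_stream_si32(&dst[i], 0);
+//   _mm_sfence();   // publish the stores before another thread reads dst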
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si32 +FORCE_INLINE void _mm_stream_si32(int *p, int a) +{ + vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0); +} + +// Load 128-bits of integer data from memory into dst using a non-temporal +// memory hint. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// +// dst[127:0] := MEM[mem_addr+127:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128 +FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p) +{ +#if __has_builtin(__builtin_nontemporal_store) + return __builtin_nontemporal_load(p); +#else + return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p)); +#endif +} + +// Cache line containing p is flushed and invalidated from all caches in the +// coherency domain. : +// https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx +FORCE_INLINE void _mm_clflush(void const *p) +{ + (void) p; + // no corollary for Neon? +} + +// Allocate aligned blocks of memory. +// https://software.intel.com/en-us/ +// cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks +FORCE_INLINE void *_mm_malloc(size_t size, size_t align) +{ + void *ptr; + if (align == 1) + return malloc(size); + if (align == 2 || (sizeof(void *) == 8 && align == 4)) + align = sizeof(void *); + if (!posix_memalign(&ptr, align, size)) + return ptr; + return NULL; +} + +// Conditionally store 8-bit integer elements from a into memory using mask +// (elements are not stored when the highest bit is not set in the corresponding +// element) and a non-temporal memory hint. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmove_si64 +FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr) +{ + int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7); + __m128 b = _mm_load_ps((const float *) mem_addr); + int8x8_t masked = + vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a), + vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b)))); + vst1_s8((int8_t *) mem_addr, masked); +} + +// Conditionally store 8-bit integer elements from a into memory using mask +// (elements are not stored when the highest bit is not set in the corresponding +// element) and a non-temporal memory hint. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_maskmovq +#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr) + +// Free aligned memory that was allocated with _mm_malloc. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_free +FORCE_INLINE void _mm_free(void *addr) +{ + free(addr); +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 8-bit integer v. +// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100) +FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#else + crc ^= v; + for (int bit = 0; bit < 8; bit++) { + if (crc & 1) + crc = (crc >> 1) ^ UINT32_C(0x82f63b78); + else + crc = (crc >> 1); + } +#endif + return crc; +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 16-bit integer v. 
+// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100) +FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#else + crc = _mm_crc32_u8(crc, v & 0xff); + crc = _mm_crc32_u8(crc, (v >> 8) & 0xff); +#endif + return crc; +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 32-bit integer v. +// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100) +FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#else + crc = _mm_crc32_u16(crc, v & 0xffff); + crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff); +#endif + return crc; +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 64-bit integer v. +// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100) +FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#else + crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff); + crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff); +#endif + return crc; +} + +FORCE_INLINE void _mm_empty (void) { } + +#if defined(__GNUC__) || defined(__clang__) +#pragma pop_macro("ALIGN_STRUCT") +#pragma pop_macro("FORCE_INLINE") +#endif + +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC pop_options +#endif + +#endif diff --git a/dedicated/wscript b/dedicated/wscript new file mode 100755 index 00000000..485ef288 --- /dev/null +++ b/dedicated/wscript @@ -0,0 +1,68 @@ +#! /usr/bin/env python +# encoding: utf-8 + +from waflib import Utils +import os + +top = '.' 
+PROJECT_NAME = 'dedicated' + +def options(opt): + # stub + return + +def configure(conf): + conf.define('LAUNCHERONLY',1) +# conf.define('SUPPORT_PACKED_STORE',1) + conf.define('DEDICATED',1) + +def build(bld): + source = [ + 'filesystem.cpp', + '../public/filesystem_init.cpp', + '../common/netapi.cpp', + '../common/SteamAppStartup.cpp', + 'sys_common.cpp', + 'sys_ded.cpp', + #'sys_windows.cpp', [$WINDOWS] + 'sys_linux.cpp', # [$POSIX] + 'console/conproc.cpp', + 'console/textconsole.cpp', + 'console/TextConsoleUnix.cpp', # [$POSIX] + '../filesystem/filetracker.cpp', + '../filesystem/basefilesystem.cpp', + '../filesystem/packfile.cpp', + '../filesystem/filesystem_async.cpp', + '../filesystem/filesystem_stdio.cpp', + '../filesystem/QueuedLoader.cpp', + '../public/zip_utils.cpp', + '../filesystem/linux_support.cpp' # [$POSIX] + ] + + includes = [ + '.', + '../public', + '../public/tier0', + '../public/tier1', + '../common' + ] + bld.env.INCLUDES_SDL2 + + defines = [] + + libs = ['tier0','tier1','tier2','tier3','vstdlib','steam_api','vpklib','appframework','mathlib', 'EDIT'] + + install_path = bld.env.LIBDIR + + bld.shlib( + source = source, + target = PROJECT_NAME, + name = PROJECT_NAME, + features = 'c cxx', + includes = includes, + defines = defines, + use = libs, + install_path = install_path, + subsystem = bld.env.MSVC_SUBSYSTEM, + idx = bld.get_taskgen_count() + ) + diff --git a/dedicated_main/main.cpp b/dedicated_main/main.cpp index 9c64d7aa..de75a373 100644 --- a/dedicated_main/main.cpp +++ b/dedicated_main/main.cpp @@ -209,6 +209,9 @@ int main( int argc, char *argv[] ) const char *pBinaryName = "dedicated" DLL_EXT_STRING; void *dedicated = dlopen( pBinaryName, RTLD_NOW ); + if ( !dedicated ) + dedicated = dlopen( "libdedicated" DLL_EXT_STRING, RTLD_NOW ); + if ( !dedicated ) { printf( "Failed to open %s (%s)\n", pBinaryName, dlerror()); diff --git a/dedicated_main/wscript b/dedicated_main/wscript new file mode 100755 index 00000000..97a5f994 --- /dev/null +++ b/dedicated_main/wscript @@ -0,0 +1,43 @@ +#! /usr/bin/env python +# encoding: utf-8 + +from waflib import Utils +import os + +top = '.' +PROJECT_NAME = 'dedicated_launcher' + +def options(opt): + # stub + return + +def configure(conf): + return + +def build(bld): + + source = ['main.cpp'] + includes = ['../public', '../public/tier0'] + defines = [] + libs = [] + + if bld.env.DEST_OS != 'win32': + libs += [ 'DL' ] + else: + libs += ['USER32', 'SHELL32'] + source += ['dedicated_main.rc'] + + install_path = bld.env.BINDIR + bld( + source = source, + target = PROJECT_NAME, + name = PROJECT_NAME, + features = 'c cxx cxxprogram', + includes = includes, + defines = defines, + use = libs, + install_path = install_path, + subsystem = bld.env.MSVC_SUBSYSTEM, + idx = bld.get_taskgen_count() + ) + diff --git a/engine/sys_dll2.cpp b/engine/sys_dll2.cpp index 30fbe4ae..e6742211 100644 --- a/engine/sys_dll2.cpp +++ b/engine/sys_dll2.cpp @@ -1175,7 +1175,7 @@ InitReturnVal_t CEngineAPI::Init() m_bRunningSimulation = false; // Initialize the FPU control word -#if defined(WIN32) && !defined( SWDS ) && !defined( _X360 ) +#if defined(WIN32) && !defined( SWDS ) && !defined( _X360 ) && !defined (__arm__) _asm { fninit diff --git a/engine/sys_engine.cpp b/engine/sys_engine.cpp index c75abcff..d9083778 100644 --- a/engine/sys_engine.cpp +++ b/engine/sys_engine.cpp @@ -39,6 +39,9 @@ // memdbgon must be the last include file in a .cpp file!!! 
#include "tier0/memdbgon.h" +#ifdef POSIX +#include +#endif //----------------------------------------------------------------------------- // Forward declarations @@ -366,7 +369,11 @@ void CEngine::Frame( void ) for (int i = 2000; i >= 0; --i) { #if defined(POSIX) +#ifdef __arm__ + raise(SIGINT); +#else __asm( "pause" ); __asm( "pause" ); __asm( "pause" ); __asm( "pause" ); +#endif #elif defined(IS_WINDOWS_PC) _asm { pause }; _asm { pause }; _asm { pause }; _asm { pause }; #endif diff --git a/engine/wscript b/engine/wscript index ce4dc481..2881101e 100755 --- a/engine/wscript +++ b/engine/wscript @@ -12,6 +12,10 @@ def options(opt): return def configure(conf): + if conf.options.DEDICATED: + conf.define('SWDS', 1) + conf.define('NO_STEAM', 1) + conf.env.append_unique('DEFINES',[ '__USEA3D', '_ADD_EAX_', @@ -22,50 +26,18 @@ def configure(conf): def build(bld): source = [ - 'client_pch.cpp', ##[!$DEDICATED] - 'cl_rcon.cpp', ##[!$DEDICATED] 'socketcreator.cpp', - 'rpt_engine.cpp', ##[!$DEDICATED] - 'cl_steamauth.cpp',# #[!$DEDICATED] 'clientframe.cpp', 'decal_clip.cpp', 'demofile.cpp', 'DevShotGenerator.cpp', 'OcclusionSystem.cpp', 'tmessage.cpp', - 'r_efx.cpp',# #[!$DEDICATED] - 'view.cpp', ##[!$DEDICATED] 'baseclient.cpp', 'baseclientstate.cpp', 'cbenchmark.cpp', - 'cdll_engine_int.cpp', ##[!$DEDICATED] / - 'cl_main.cpp',##[!$DEDICATED] / - 'cl_demo.cpp',##[!$DEDICATED] / - #'cl_null.cpp', [$DEDICATED] / - 'cl_demoaction.cpp', #[!$DEDICATED] / - 'cl_demoaction_types.cpp', #[!$DEDICATED] / - 'cl_demoactioneditors.cpp', #[!$DEDICATED] / - 'cl_demoactionmanager.cpp', #[!$DEDICATED] / - 'cl_demoeditorpanel.cpp', #[!$DEDICATED] / - 'cl_demosmootherpanel.cpp', #[!$DEDICATED] / - 'cl_demouipanel.cpp', #[!$DEDICATED] / - 'cl_foguipanel.cpp', #[!$DEDICATED] / - 'cl_txviewpanel.cpp', #[!$DEDICATED] / - 'cl_entityreport.cpp', #[!$DEDICATED] / - 'cl_ents_parse.cpp', #[!$DEDICATED] / - 'cl_localnetworkbackdoor.cpp', #[!$DEDICATED] / - 'cl_parse_event.cpp', #[!$DEDICATED] / - 'cl_pluginhelpers.cpp',#[!$DEDICATED] / - 'cl_pred.cpp', #[!$DEDICATED] / - 'cl_texturelistpanel.cpp', #[!$DEDICATED] / - 'client.cpp',#[!$DEDICATED] / - 'colorcorrectionpanel.cpp', #[!$DEDICATED] / 'console.cpp', 'render_pch.cpp', - - 'buildcubemaps.cpp', #[!$DEDICATED] / - 'debug_leafvis.cpp', #[!$DEDICATED] / - 'debugoverlay.cpp', #[!$DEDICATED] / 'decals.cpp', 'disp.cpp', 'disp_interface.cpp', @@ -73,32 +45,19 @@ def build(bld): 'gl_draw.cpp', 'gl_rsurf.cpp', 'gl_shader.cpp', - 'gl_drawlights.cpp', #[!$DEDICATED] / - 'gl_lightmap.cpp', #[!$DEDICATED] / - 'gl_matsysiface.cpp', #[!$DEDICATED] / - 'gl_rlight.cpp',#[!$DEDICATED] / - 'gl_rmain.cpp',#[!$DEDICATED] / - 'gl_rmisc.cpp',#[!$DEDICATED] / - 'gl_screen.cpp',#[!$DEDICATED] / - 'gl_warp.cpp',#[!$DEDICATED] / 'l_studio.cpp', 'matsys_interface.cpp', 'modelloader.cpp', 'Overlay.cpp', - 'r_areaportal.cpp', #[!$DEDICATED] / 'r_decal.cpp', 'r_linefile.cpp', - 'shadowmgr.cpp',#[!$DEDICATED] 'server_pch.cpp', - - 'sv_ipratelimit.cpp', 'sv_rcon.cpp', 'sv_steamauth.cpp', 'sv_uploaddata.cpp', 'sv_uploadgamestats.cpp', 'vengineserver_impl.cpp', - 'sv_main.cpp', 'sv_client.cpp', 'sv_ents_write.cpp', @@ -110,7 +69,6 @@ def build(bld): 'sv_precache.cpp', 'sv_redirect.cpp', 'sv_remoteaccess.cpp', - 'baseautocompletefilelist.cpp', 'baseserver.cpp', 'bitbuf_errorhandler.cpp', @@ -121,7 +79,6 @@ def build(bld): 'checksum_engine.cpp', 'ccs.cpp', 'clockdriftmgr.cpp', - 'cl_bounded_cvars.cpp', ##[!$DEDICATED] 'cl_check_process.cpp', 'cmd.cpp', 'cmodel.cpp', @@ -137,7 +94,6 @@ def build(bld): 
'../public/disp_powerinfo.cpp', '../public/dispcoll_common.cpp', 'DownloadListGenerator.cpp', - 'downloadthread.cpp', ##[!$DEDICATED] 'dt.cpp', 'dt_common_eng.cpp', 'dt_encode.cpp', @@ -178,14 +134,8 @@ def build(bld): 'initmathlib.cpp', '../common/language.cpp', 'LocalNetworkBackdoor.cpp', - 'LoadScreenUpdate.cpp', #[!$DEDICATED] '../public/lumpfiles.cpp', 'MapReslistGenerator.cpp', - 'matchmakinghost.cpp', #[!$DEDICATED] - 'matchmakingqos.cpp', #[!$DEDICATED] - 'matchmakingclient.cpp', #[!$DEDICATED] - 'matchmakingshared.cpp', #[!$DEDICATED] - 'matchmakingmigrate.cpp', #[!$DEDICATED] 'materialproxyfactory.cpp', 'mem_fgets.cpp', 'mod_vis.cpp', @@ -211,10 +161,8 @@ def build(bld): 'engine_replay_int.cpp', 'replay_internal.cpp', 'replaydemo.cpp', - 'replaydemoplayer.cpp', #[!$DEDICATED] 'replayserver.cpp', '../public/sentence.cpp', - 'Session.cpp', #[!$DEDICATED] 'sound_shared.cpp', 'spatialpartition.cpp', 'staticpropmgr.cpp', @@ -222,7 +170,6 @@ def build(bld): 'sys_dll.cpp', 'sys_dll2.cpp', 'sys_engine.cpp', - 'sys_mainwind.cpp', #[!$DEDICATED] 'sys_linuxwind.cpp', #[$POSIX] 'testscriptmgr.cpp', 'traceinit.cpp', @@ -234,56 +181,22 @@ def build(bld): '../public/XZip.cpp', '../public/XUnzip.cpp', 'zone.cpp', - 'bugreporter.cpp', #[!$DEDICATED] 'cheatcodes.cpp', - 'download.cpp', #[!$DEDICATED] '../public/editor_sendcommand.cpp', - 'host_saverestore.cpp', #[!$DEDICATED] 'keys.cpp', - 'lightcache.cpp', #[!$DEDICATED] 'networkstringtableclient.cpp', - 'saverestore_filesystem.cpp', #[!$DEDICATED] '../public/scratchpad3d.cpp', 'servermsghandler.cpp', - 'sys_getmodes.cpp', #[!$DEDICATED] - 'vgui_askconnectpanel.cpp', #[!$DEDICATED] - 'xboxsystem.cpp', #[!$DEDICATED] '../common/SourceAppInfo.cpp', 'snd_io.cpp', 'EngineSoundServer.cpp', - 'EngineSoundClient.cpp', #[!$DEDICATED] - 'engsoundservice.cpp', #[!$DEDICATED] 'audio/private/voice_wavefile.cpp', - 'audio/private/MPAFile.cpp', #[!$DEDICATED&&!$X360] - 'audio/private/MPAHeader.cpp', #[!$DEDICATED&&!$X360] - 'audio/private/circularbuffer.cpp', #[!$DEDICATED] - 'audio/private/snd_posix.cpp', # [$POSIX] - - 'audio/audio_pch.cpp', #[!$DEDICATED] - 'audio/private/vox.cpp', - 'audio/private/snd_dev_common.cpp', #[!$DEDICATED] / - 'audio/private/snd_dma.cpp', #[!$DEDICATED] / - 'audio/private/snd_dsp.cpp', #[!$DEDICATED] / - 'audio/private/snd_mix.cpp', #[!$DEDICATED] / - 'audio/private/snd_sentence_mixer.cpp', #[!$DEDICATED]/ - 'audio/private/snd_wave_data.cpp', #[!$DEDICATED] / - 'audio/private/snd_wave_mixer.cpp', #[!$DEDICATED] / - 'audio/private/snd_wave_mixer_adpcm.cpp', #[!$DEDICATED] / - 'audio/private/snd_wave_source.cpp', #[!$DEDICATED] / - 'audio/private/snd_wave_temp.cpp', #[!$DEDICATED] / - 'audio/private/snd_win.cpp', #[!$DEDICATED] / - 'audio/private/voice_gain.cpp', - + 'audio/private/snd_posix.cpp', # [$POSIX] #'audio/private/snd_dev_direct.cpp', [$WINDOWS]/ #'audio/private/snd_dev_wave.cpp', [$WINDOWS]/ - 'audio/private/snd_mp3_source.cpp', #[!$DEDICATED]/ - 'audio/private/snd_wave_mixer_mp3.cpp', #[!$DEDICATED] / - 'audio/private/VBRHeader.cpp', # [!$DEDICATED&&!$X360]/ - 'audio/private/voice.cpp', #[!$DEDICATED&&!$X360]/ #'audio/private/voice_mixer_controls.cpp', [$WINDOWS] / #'audio/private/voice_record_dsound.cpp', [$WINDOWS] / - 'audio/private/voice_sound_engine_interface.cpp', #[!$DEDICATED&&!$X360] #'audio/private/snd_dev_xaudio.cpp',[$X360] #'audio/private/snd_wave_mixer_xma.cpp', [$X360] @@ -291,29 +204,116 @@ def build(bld): 'audio/private/snd_dev_sdl.cpp', #[$SDL && !$OSXALL] #'audio/private/snd_dev_openal.cpp', # 
[$OSXALL] #'audio/private/snd_dev_mac_audioqueue.cpp',# [$OSXALL] - 'audio/private/voice_mixer_controls_openal.cpp', #[$OSXALL||($LINUXALL&&!$DEDICATED)] - 'audio/private/voice_record_openal.cpp', #[$OSXALL||($LINUXALL&&!$DEDICATED)] #'audio/private/voice_record_mac_audioqueue.cpp', #[$OSXALL] - - '../public/vgui_controls/vgui_controls.cpp', - '../common/vgui/vgui_basebudgetpanel.cpp', - '../common/vgui/vgui_budgetbargraphpanel.cpp', - '../common/vgui/vgui_budgethistorypanel.cpp', - '../common/vgui/vgui_budgetpanelshared.cpp', - 'perfuipanel.cpp', - 'vgui_basepanel.cpp', - 'vgui_baseui_interface.cpp', - 'vgui_budgetpanel.cpp', - 'vgui_DebugSystemPanel.cpp', - 'vgui_drawtreepanel.cpp', - 'vgui_helpers.cpp', - 'vgui_texturebudgetpanel.cpp', - 'vgui_vprofgraphpanel.cpp', - 'vgui_vprofpanel.cpp', - 'enginetool.cpp', #[!$DEDICATED] - 'toolframework.cpp' ] + if bld.env.DEDICATED: + source += ['cl_null.cpp'] + else: + source += [ + 'client_pch.cpp', + 'cl_rcon.cpp', + 'r_efx.cpp', + 'view.cpp', + 'rpt_engine.cpp', + 'cl_steamauth.cpp', + 'cdll_engine_int.cpp', + 'cl_main.cpp', + 'cl_demo.cpp', + 'cl_demoaction.cpp', + 'cl_demoaction_types.cpp', + 'cl_demoactioneditors.cpp', + 'cl_demoactionmanager.cpp', + 'cl_demoeditorpanel.cpp', + 'cl_demosmootherpanel.cpp', + 'cl_demouipanel.cpp', + 'cl_foguipanel.cpp', + 'cl_txviewpanel.cpp', + 'cl_entityreport.cpp', + 'cl_ents_parse.cpp', + 'cl_localnetworkbackdoor.cpp', + 'cl_parse_event.cpp', + 'cl_pluginhelpers.cpp', + 'cl_pred.cpp', + 'cl_texturelistpanel.cpp', + 'client.cpp', + 'colorcorrectionpanel.cpp', + 'buildcubemaps.cpp', + 'debug_leafvis.cpp', + 'debugoverlay.cpp', + 'LoadScreenUpdate.cpp', + 'enginetool.cpp', + 'gl_drawlights.cpp', + 'gl_lightmap.cpp', + 'gl_matsysiface.cpp', + 'gl_rlight.cpp', + 'gl_rmain.cpp', + 'gl_rmisc.cpp', + 'gl_screen.cpp', + 'gl_warp.cpp', + 'r_areaportal.cpp', + 'shadowmgr.cpp', + 'cl_bounded_cvars.cpp', + 'downloadthread.cpp', + 'matchmakinghost.cpp', + 'matchmakingqos.cpp', + 'matchmakingclient.cpp', + 'matchmakingshared.cpp', + 'matchmakingmigrate.cpp', + 'replaydemoplayer.cpp', + 'Session.cpp', + 'sys_mainwind.cpp', + 'bugreporter.cpp', + 'download.cpp', + 'host_saverestore.cpp', + 'lightcache.cpp', + 'saverestore_filesystem.cpp', + 'sys_getmodes.cpp', + 'vgui_askconnectpanel.cpp', + 'xboxsystem.cpp', + 'audio/audio_pch.cpp', + 'EngineSoundClient.cpp', + 'engsoundservice.cpp', + 'audio/private/MPAFile.cpp', #[!$X360] + 'audio/private/MPAHeader.cpp', #[!$X360] + 'audio/private/circularbuffer.cpp', + 'audio/private/snd_dev_common.cpp', + 'audio/private/snd_dma.cpp', + 'audio/private/snd_dsp.cpp', + 'audio/private/snd_mix.cpp', + 'audio/private/snd_sentence_mixer.cpp', + 'audio/private/snd_wave_data.cpp', + 'audio/private/snd_wave_mixer.cpp', + 'audio/private/snd_wave_mixer_adpcm.cpp', + 'audio/private/snd_wave_source.cpp', + 'audio/private/snd_wave_temp.cpp', + 'audio/private/snd_win.cpp', + 'audio/private/voice_gain.cpp', + 'audio/private/snd_mp3_source.cpp', + 'audio/private/snd_wave_mixer_mp3.cpp', + 'audio/private/VBRHeader.cpp', #[!$X360] + 'audio/private/voice.cpp', #[!$X360] + 'audio/private/voice_sound_engine_interface.cpp', #[!$X360] + 'audio/private/voice_mixer_controls_openal.cpp', #[$OSXALL||$LINUXALL] + 'audio/private/voice_record_openal.cpp', #[$OSXALL||$LINUXALL] + '../public/vgui_controls/vgui_controls.cpp', + '../common/vgui/vgui_basebudgetpanel.cpp', + '../common/vgui/vgui_budgetbargraphpanel.cpp', + '../common/vgui/vgui_budgethistorypanel.cpp', + '../common/vgui/vgui_budgetpanelshared.cpp', + 
'perfuipanel.cpp',
+			'vgui_basepanel.cpp',
+			'vgui_baseui_interface.cpp',
+			'vgui_budgetpanel.cpp',
+			'vgui_DebugSystemPanel.cpp',
+			'vgui_drawtreepanel.cpp',
+			'vgui_helpers.cpp',
+			'vgui_texturebudgetpanel.cpp',
+			'vgui_vprofgraphpanel.cpp',
+			'vgui_vprofpanel.cpp',
+			'toolframework.cpp'
+		]
+
 	includes = [
 		'.',
 		'../public',
@@ -327,7 +327,7 @@ def build(bld):
 
 	defines = []
 
-	libs = ['tier0','vgui_controls','dmxloader','tier1','tier2','tier3','bitmap','vstdlib','appframework','datamodel','vtf','mathlib','steam_api','matsys_controls','BZIP2','SDL2','JPEG','ZLIB','OPENAL','CURL']
+	libs = ['tier0','vgui_controls','dmxloader','tier1','tier2','tier3','bitmap','vstdlib','appframework','datamodel','vtf','mathlib','steam_api','matsys_controls','BZ2','SDL2','JPEG','ZLIB','OPENAL','CURL']
 
 	install_path = bld.env.LIBDIR
 
diff --git a/filesystem/basefilesystem.cpp b/filesystem/basefilesystem.cpp
index b154b70e..4423cb4c 100644
--- a/filesystem/basefilesystem.cpp
+++ b/filesystem/basefilesystem.cpp
@@ -5054,7 +5054,6 @@ CSysModule *CBaseFileSystem::LoadModule( const char *pFileName, const char *pPat
 #ifdef POSIX
 		Q_snprintf( tempPathID, sizeof(tempPathID), "%slib%s", m_SearchPaths[i].GetPathString(), pFileName );	// append the path to this dir.
-		printf(tempPathID);
 		pModule = Sys_LoadModule( tempPathID );
 		if ( pModule )
 			return pModule;
diff --git a/inputsystem/wscript b/inputsystem/wscript
index 3ff6305d..fe090fbf 100755
--- a/inputsystem/wscript
+++ b/inputsystem/wscript
@@ -27,9 +27,8 @@ def build(bld):
 		'.',
 		'../common',
 		'../public',
-		'../public/tier0',
-		'../thirdparty/SDL2'
-	]
+		'../public/tier0'
+	]
 
 	defines = []
 
diff --git a/launcher_main/main.cpp b/launcher_main/main.cpp
index 22d1a779..6e20ba50 100644
--- a/launcher_main/main.cpp
+++ b/launcher_main/main.cpp
@@ -216,12 +216,12 @@ static void WaitForDebuggerConnect( int argc, char *argv[], int time )
 int main( int argc, char *argv[] )
 {
 	void *launcher = dlopen( "bin/liblauncher" DLL_EXT_STRING, RTLD_NOW );
 	if( !launcher )
-		void *launcher = dlopen( "bin/launcher" DLL_EXT_STRING, RTLD_NOW );
+		launcher = dlopen( "bin/launcher" DLL_EXT_STRING, RTLD_NOW );
 	if ( !launcher )
 	{
-		fprintf( stderr, "Failed to load the launcher\n" );
+		fprintf( stderr, "%s\nFailed to load the launcher\n", dlerror() );
 		return 0;
 	}
diff --git a/materialsystem/shaderapiempty/wscript b/materialsystem/shaderapiempty/wscript
new file mode 100755
index 00000000..3dab5e41
--- /dev/null
+++ b/materialsystem/shaderapiempty/wscript
@@ -0,0 +1,52 @@
+#! /usr/bin/env python
+# encoding: utf-8
+
+from waflib import Utils
+import os
+
+top = '.'
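+
+# A stub shader API: it satisfies materialsystem's renderer dependency while
+# doing no graphics work at all, which is what lets the dedicated target be
+# built with SDL and ToGL switched off.
+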
+PROJECT_NAME = 'shaderapiempty' + +def options(opt): + # stub + return + +def configure(conf): + conf.env.append_unique('DEFINES',[ + 'SHADER_DLL_EXPORT', + 'PROTECTED_THINGS_ENABLE' + ]) + +def build(bld): + source = [ + 'shaderapiempty.cpp' + ] + + includes = [ + '.', + '../../public', + '../../public/tier0', + '../../public/tier1', + '../../common', + '../' + ] + bld.env.INCLUDES_SDL2 + + defines = [] + + libs = ['tier0','tier1'] + + install_path = bld.env.LIBDIR + + bld.shlib( + source = source, + target = PROJECT_NAME, + name = PROJECT_NAME, + features = 'c cxx', + includes = includes, + defines = defines, + use = libs, + install_path = install_path, + subsystem = bld.env.MSVC_SUBSYSTEM, + idx = bld.get_taskgen_count() + ) + diff --git a/mathlib/sse.cpp b/mathlib/sse.cpp index 018a7a5b..83dda7d9 100644 --- a/mathlib/sse.cpp +++ b/mathlib/sse.cpp @@ -11,6 +11,10 @@ #include "tier0/dbg.h" #include "mathlib/mathlib.h" #include "mathlib/vector.h" +#ifdef __arm__ +#include "sse2neon.h" +#endif + #include "sse.h" // memdbgon must be the last include file in a .cpp file!!! @@ -176,7 +180,9 @@ float _SSE_RSqrtFast(float x) Assert( s_bMathlibInitialized ); float rroot; -#ifdef _WIN32 +#ifdef __arm__ + rroot = _SSE_RSqrtAccurate(x); +#elif _WIN32 _asm { rsqrtss xmm0, x @@ -204,16 +210,19 @@ float FASTCALL _SSE_VectorNormalize (Vector& vec) #endif float *v = &vec[0]; -#ifdef _WIN32 float *r = &result[0]; -#endif float radius = 0.f; // Blah, get rid of these comparisons ... in reality, if you have all 3 as zero, it shouldn't // be much of a performance win, considering you will very likely miss 3 branch predicts in a row. if ( v[0] || v[1] || v[2] ) { -#ifdef _WIN32 +#ifdef __arm__ + float rsqrt = _SSE_RSqrtAccurate( v[0] * v[0] + v[1] * v[1] + v[2] * v[2] ); + r[0] = v[0] * rsqrt; + r[1] = v[1] * rsqrt; + r[2] = v[2] * rsqrt; +#elif _WIN32 _asm { mov eax, v @@ -287,7 +296,9 @@ void FASTCALL _SSE_VectorNormalizeFast (Vector& vec) float _SSE_InvRSquared(const float* v) { float inv_r2 = 1.f; -#ifdef _WIN32 +#ifdef __arm__ + return _SSE_RSqrtAccurate( FLT_EPSILON + v[0] * v[0] + v[1] * v[1] + v[2] * v[2] ); +#elif _WIN32 _asm { // Intel SSE only routine mov eax, v movss xmm5, inv_r2 // x5 = 1.0, 0, 0, 0 @@ -380,7 +391,14 @@ typedef __m64 v2si; // vector of 2 int (mmx) void _SSE_SinCos(float x, float* s, float* c) { -#ifdef _WIN32 +#ifdef __arm__ +#if defined( POSIX ) + sincosf(x, s, c); +#else + *s = sin( x ); + *c = cos( x ); +#endif +#elif _WIN32 float t4, t8, t12; __asm @@ -587,7 +605,9 @@ void _SSE_SinCos(float x, float* s, float* c) float _SSE_cos( float x ) { -#ifdef _WIN32 +#ifdef __arm__ + return cos(x); +#elif _WIN32 float temp; __asm { diff --git a/public/bitmap/imageformat.h b/public/bitmap/imageformat.h index 196397e0..3a982514 100644 --- a/public/bitmap/imageformat.h +++ b/public/bitmap/imageformat.h @@ -28,7 +28,9 @@ typedef enum _D3DFORMAT D3DFORMAT; //----------------------------------------------------------------------------- // don't bitch that inline functions aren't used!!!! 
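+// (C4514, "unreferenced inline function has been removed", is an MSVC-only
+// warning, so the guard keeps GCC and Clang from seeing an unknown pragma.)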
+#ifdef _WIN32 #pragma warning(disable : 4514) +#endif enum ImageFormat { diff --git a/public/materialsystem/imesh.h b/public/materialsystem/imesh.h index 3da443b4..5f23f24e 100644 --- a/public/materialsystem/imesh.h +++ b/public/materialsystem/imesh.h @@ -1156,6 +1156,9 @@ inline void CVertexBuilder::FastAdvanceNVertices( int n ) //----------------------------------------------------------------------------- inline void CVertexBuilder::FastVertex( const ModelVertexDX7_t &vertex ) { +#ifdef __arm__ + FastVertexSSE( vertex ); +#else Assert( m_CompressionType == VERTEX_COMPRESSION_NONE ); // FIXME: support compressed verts if needed Assert( m_nCurrentVertex < m_nMaxVertexCount ); @@ -1213,6 +1216,7 @@ inline void CVertexBuilder::FastVertex( const ModelVertexDX7_t &vertex ) m_bWrittenNormal = false; m_bWrittenUserData = false; #endif +#endif } inline void CVertexBuilder::FastVertexSSE( const ModelVertexDX7_t &vertex ) @@ -1322,6 +1326,9 @@ inline void CVertexBuilder::Fast4VerticesSSE( inline void CVertexBuilder::FastVertex( const ModelVertexDX8_t &vertex ) { +#ifdef __arm__ + FastVertexSSE( vertex ); +#else Assert( m_CompressionType == VERTEX_COMPRESSION_NONE ); // FIXME: support compressed verts if needed Assert( m_nCurrentVertex < m_nMaxVertexCount ); @@ -1386,8 +1393,10 @@ inline void CVertexBuilder::FastVertex( const ModelVertexDX8_t &vertex ) m_bWrittenNormal = false; m_bWrittenUserData = false; #endif +#endif } + inline void CVertexBuilder::FastVertexSSE( const ModelVertexDX8_t &vertex ) { Assert( m_CompressionType == VERTEX_COMPRESSION_NONE ); // FIXME: support compressed verts if needed @@ -1414,7 +1423,8 @@ inline void CVertexBuilder::FastVertexSSE( const ModelVertexDX8_t &vertex ) #elif defined(GNUC) const void *pRead = &vertex; void *pCurrPos = m_pCurrPosition; - __asm__ __volatile__ ( + +/* __asm__ __volatile__ ( "movaps (%0), %%xmm0\n" "movaps 16(%0), %%xmm1\n" "movaps 32(%0), %%xmm2\n" @@ -1422,8 +1432,17 @@ inline void CVertexBuilder::FastVertexSSE( const ModelVertexDX8_t &vertex ) "movntps %%xmm0, (%1)\n" "movntps %%xmm1, 16(%1)\n" "movntps %%xmm2, 32(%1)\n" - "movntps %%xmm3, 48(%1)\n" - :: "r" (pRead), "r" (pCurrPos) : "memory"); + "movntps %%xmm3, 48(%1)\n" + :: "r" (pRead), "r" (pCurrPos) : "memory"); */ + + __m128 m1 = _mm_load_ps( (float *)pRead ); + __m128 m2 = _mm_load_ps( (float *)(pRead + 16) ); + __m128 m3 = _mm_load_ps( (float *)(pRead + 32) ); + __m128 m4 = _mm_load_ps( (float *)(pRead + 48) ); + _mm_stream_ps( (float *)pCurrPos, m1 ); + _mm_stream_ps( (float *)(pCurrPos + 16), m2 ); + _mm_stream_ps( (float *)(pCurrPos + 32), m3 ); + _mm_stream_ps( (float *)(pCurrPos + 48), m4 ); #else Error( "Implement CMeshBuilder::FastVertexSSE((dx8)" ); #endif diff --git a/public/mathlib/mathlib.h b/public/mathlib/mathlib.h index a6d302ff..fe103e5e 100644 --- a/public/mathlib/mathlib.h +++ b/public/mathlib/mathlib.h @@ -458,11 +458,7 @@ void inline SinCos( float radians, float *sine, float *cosine ) *sine = sin( radians ); *cosine = cos( radians ); #elif defined( POSIX ) - double __cosr, __sinr; - __asm ("fsincos" : "=t" (__cosr), "=u" (__sinr) : "0" (radians)); - - *sine = __sinr; - *cosine = __cosr; + sincosf(radians, sine, cosine); #endif } @@ -1217,6 +1213,8 @@ FORCEINLINE int RoundFloatToInt(float f) }; flResult = __fctiw( f ); return pResult[1]; +#elif defined (__arm__) + return (int)(f + 0.5f); #else #error Unknown architecture #endif @@ -1247,8 +1245,9 @@ FORCEINLINE unsigned long RoundFloatToUnsignedLong(float f) Assert( pIntResult[1] >= 0 ); return pResult[1]; #else 
// !X360 - -#if defined( PLATFORM_WINDOWS_PC64 ) +#ifdef __arm__ + return (unsigned long)(f + 0.5f); +#elif defined( PLATFORM_WINDOWS_PC64 ) uint nRet = ( uint ) f; if ( nRet & 1 ) { diff --git a/public/mathlib/ssemath.h b/public/mathlib/ssemath.h index c2ff48d7..d5617c4b 100644 --- a/public/mathlib/ssemath.h +++ b/public/mathlib/ssemath.h @@ -8,6 +8,8 @@ #if defined( _X360 ) #include +#elif defined(__arm__) +#include "sse2neon.h" #else #include #endif @@ -21,7 +23,7 @@ #define USE_STDC_FOR_SIMD 0 #endif -#if (!defined(_X360) && (USE_STDC_FOR_SIMD == 0)) +#if (!defined (__arm__) && !defined(_X360) && (USE_STDC_FOR_SIMD == 0)) #define _SSE1 1 #endif diff --git a/public/mathlib/vector.h b/public/mathlib/vector.h index c7654ba8..c763a3e1 100644 --- a/public/mathlib/vector.h +++ b/public/mathlib/vector.h @@ -22,7 +22,8 @@ // For rand(). We really need a library! #include -#ifndef _X360 +#if defined(__SSE__) || defined(_M_IX86_FP) +#define USE_SSE // For MMX intrinsics #include #endif @@ -209,10 +210,9 @@ private: FORCEINLINE void NetworkVarConstruct( Vector &v ) { v.Zero(); } - -#define USE_M64S ( ( !defined( _X360 ) ) ) - - +#ifdef USE_SSE +#define USE_M64S +#endif //========================================================= // 4D Short Vector (aligned on 8-byte boundary) diff --git a/public/mathlib/vector4d.h b/public/mathlib/vector4d.h index 2b20c882..d63cf52b 100644 --- a/public/mathlib/vector4d.h +++ b/public/mathlib/vector4d.h @@ -16,7 +16,7 @@ #include #include // For rand(). We really need a library! #include -#if !defined( _X360 ) +#if defined(__SSE__) || defined(_M_IX86_FP) #include // For SSE #endif #include "basetypes.h" // For vec_t, put this somewhere else? @@ -141,8 +141,10 @@ public: inline void Set( vec_t X, vec_t Y, vec_t Z, vec_t W ); inline void InitZero( void ); +#ifndef __arm__ inline __m128 &AsM128() { return *(__m128*)&x; } inline const __m128 &AsM128() const { return *(const __m128*)&x; } +#endif private: // No copy constructors allowed if we're in optimal mode @@ -613,8 +615,10 @@ inline void Vector4DAligned::Set( vec_t X, vec_t Y, vec_t Z, vec_t W ) } inline void Vector4DAligned::InitZero( void ) -{ -#if !defined( _X360 ) +{ +#if defined (__arm__) + x = y = z = w = 0; +#elif !defined( _X360 ) this->AsM128() = _mm_set1_ps( 0.0f ); #else this->AsM128() = __vspltisw( 0 ); @@ -625,7 +629,7 @@ inline void Vector4DAligned::InitZero( void ) inline void Vector4DMultiplyAligned( Vector4DAligned const& a, Vector4DAligned const& b, Vector4DAligned& c ) { Assert( a.IsValid() && b.IsValid() ); -#if !defined( _X360 ) +#if !defined( _X360 ) || defined (__arm__) c.x = a.x * b.x; c.y = a.y * b.y; c.z = a.z * b.z; @@ -639,7 +643,7 @@ inline void Vector4DWeightMAD( vec_t w, Vector4DAligned const& vInA, Vector4DAli { Assert( vInA.IsValid() && vInB.IsValid() && IsFinite(w) ); -#if !defined( _X360 ) +#if !defined( _X360 ) || defined (__arm__) vOutA.x += vInA.x * w; vOutA.y += vInA.y * w; vOutA.z += vInA.z * w; @@ -660,6 +664,7 @@ inline void Vector4DWeightMAD( vec_t w, Vector4DAligned const& vInA, Vector4DAli #endif } +#ifndef __arm__ inline void Vector4DWeightMADSSE( vec_t w, Vector4DAligned const& vInA, Vector4DAligned& vOutA, Vector4DAligned const& vInB, Vector4DAligned& vOutB ) { Assert( vInA.IsValid() && vInB.IsValid() && IsFinite(w) ); @@ -681,6 +686,7 @@ inline void Vector4DWeightMADSSE( vec_t w, Vector4DAligned const& vInA, Vector4D vOutB.AsM128() = __vmaddfp( vInB.AsM128(), temp, vOutB.AsM128() ); #endif } +#endif #endif // VECTOR4D_H diff --git a/public/saverestoretypes.h 
b/public/saverestoretypes.h index 491a96e4..da06298e 100644 --- a/public/saverestoretypes.h +++ b/public/saverestoretypes.h @@ -512,27 +512,25 @@ inline const char *CSaveRestoreSegment::StringFromSymbol( int token ) /// compilers. Either way, there's no portable intrinsic. // Newer GCC versions provide this in this header, older did by default. -#if !defined( _rotr ) && defined( COMPILER_GCC ) +#if !defined( _rotr ) && defined( COMPILER_GCC ) && !defined( __arm__ ) #include #endif -#ifdef COMPILER_CLANG -static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__)) -_rotr(unsigned int _Value, int _Shift) { - _Shift &= 0x1f; - return _Shift ? (_Value >> _Shift) | (_Value << (32 - _Shift)) : _Value; +#if !defined ( _rotr ) +inline unsigned _rotr(unsigned x, unsigned n) { + return (x >> n % 32) | (x << (32-n) % 32); } #endif - inline unsigned int CSaveRestoreSegment::HashString( const char *pszToken ) { COMPILE_TIME_ASSERT( sizeof( unsigned int ) == 4 ); unsigned int hash = 0; while ( *pszToken ) + { hash = _rotr( hash, 4 ) ^ *pszToken++; - + } return hash; } diff --git a/public/tier0/platform.h b/public/tier0/platform.h index 63871391..3da32c7c 100644 --- a/public/tier0/platform.h +++ b/public/tier0/platform.h @@ -713,7 +713,7 @@ typedef void * HINSTANCE; // When we port to 64 bit, we'll have to resolve the int, ptr vs size_t 32/64 bit problems... -#if !defined( _WIN64 ) +#if !defined( _WIN64 ) && defined( _WIN32 ) #pragma warning( disable : 4267 ) // conversion from 'size_t' to 'int', possible loss of data #pragma warning( disable : 4311 ) // pointer truncation from 'char *' to 'int' #pragma warning( disable : 4312 ) // conversion from 'unsigned int' to 'memhandle_t' of greater size @@ -825,9 +825,9 @@ static FORCEINLINE double fsel(double fComparand, double fValGE, double fLT) #endif #endif - +#elif defined (__arm__) + inline void SetupFPUControlWord() {} #else - inline void SetupFPUControlWord() { __volatile unsigned short int __cw; @@ -849,7 +849,7 @@ static FORCEINLINE double fsel(double fComparand, double fValGE, double fLT) { double flResult; int pResult[2]; - }; + } flResult = __fctiw( f ); return ( pResult[1] == 1 ); } @@ -1160,7 +1160,11 @@ PLATFORM_INTERFACE struct tm * Plat_localtime( const time_t *timep, struct tm * inline uint64 Plat_Rdtsc() { -#if defined( _X360 ) +#if defined( __arm__ ) && defined (POSIX) + struct timespec t; + clock_gettime( CLOCK_REALTIME, &t); + return t.tv_sec * 1000000000ULL + t.tv_nsec; +#elif defined( _X360 ) return ( uint64 )__mftb32(); #elif defined( _WIN64 ) return ( uint64 )__rdtsc(); diff --git a/public/tier0/threadtools.h b/public/tier0/threadtools.h index b0b9b1d2..f5328699 100644 --- a/public/tier0/threadtools.h +++ b/public/tier0/threadtools.h @@ -25,6 +25,7 @@ #ifdef POSIX #include #include +#include #define WAIT_OBJECT_0 0 #define WAIT_TIMEOUT 0x00000102 #define WAIT_FAILED -1 @@ -141,9 +142,11 @@ inline void ThreadPause() #if defined( PLATFORM_WINDOWS_PC ) // Intrinsic for __asm pause; from _mm_pause(); -#elif POSIX +#elif POSIX && defined( __i386__ ) __asm __volatile( "pause" ); #elif defined( _X360 ) +#elif defined(__arm__) + sched_yield(); #else #error "implement me" #endif diff --git a/public/tier1/convar.h b/public/tier1/convar.h index 5043b6bc..f9f95355 100644 --- a/public/tier1/convar.h +++ b/public/tier1/convar.h @@ -627,7 +627,9 @@ void ConVar_PrintDescription( const ConCommandBase *pVar ); //----------------------------------------------------------------------------- // Purpose: Utility class to quickly allow 
ConCommands to call member methods //----------------------------------------------------------------------------- +#ifdef _WIN32 #pragma warning (disable : 4355 ) +#endif template< class T > class CConCommandMemberAccessor : public ConCommand, public ICommandCallback, public ICommandCompletionCallback @@ -674,8 +676,9 @@ private: FnMemberCommandCompletionCallback_t m_CompletionFunc; }; +#ifdef _WIN32 #pragma warning ( default : 4355 ) - +#endif //----------------------------------------------------------------------------- // Purpose: Utility macros to quicky generate a simple console command diff --git a/public/tier1/utlblockmemory.h b/public/tier1/utlblockmemory.h index e7e06aeb..35ef2da2 100644 --- a/public/tier1/utlblockmemory.h +++ b/public/tier1/utlblockmemory.h @@ -21,8 +21,10 @@ #include "tier0/memalloc.h" #include "tier0/memdbgon.h" +#ifdef _WIN32 #pragma warning (disable:4100) #pragma warning (disable:4514) +#endif //----------------------------------------------------------------------------- diff --git a/public/tier1/utlfixedmemory.h b/public/tier1/utlfixedmemory.h index 6ff8c19b..c051bdae 100644 --- a/public/tier1/utlfixedmemory.h +++ b/public/tier1/utlfixedmemory.h @@ -20,8 +20,10 @@ #include "tier0/memalloc.h" #include "tier0/memdbgon.h" +#ifdef _WIN32 #pragma warning (disable:4100) #pragma warning (disable:4514) +#endif //----------------------------------------------------------------------------- diff --git a/public/tier1/utllinkedlist.h b/public/tier1/utllinkedlist.h index 37ecef6b..dda8162d 100644 --- a/public/tier1/utllinkedlist.h +++ b/public/tier1/utllinkedlist.h @@ -550,8 +550,10 @@ inline I CUtlLinkedList::PrivateNext( I i ) const // Are nodes in the list or valid? //----------------------------------------------------------------------------- +#ifdef _WIN32 #pragma warning(push) #pragma warning( disable: 4310 ) // Allows "(I)(S)M::INVALID_INDEX" below +#endif template inline bool CUtlLinkedList::IndexInRange( I index ) // Static method { @@ -570,7 +572,9 @@ inline bool CUtlLinkedList::IndexInRange( I index ) // Static method return ( ( (S)index == index ) && ( (S)index != InvalidIndex() ) ); } +#ifdef _WIN32 #pragma warning(pop) +#endif template inline bool CUtlLinkedList::IsValidIndex( I i ) const diff --git a/public/tier1/utlmemory.h b/public/tier1/utlmemory.h index 9c5a1d14..66c1f28d 100644 --- a/public/tier1/utlmemory.h +++ b/public/tier1/utlmemory.h @@ -22,9 +22,10 @@ #include "tier0/memalloc.h" #include "tier0/memdbgon.h" +#ifdef _WIN32 #pragma warning (disable:4100) #pragma warning (disable:4514) - +#endif //----------------------------------------------------------------------------- diff --git a/public/tier1/utlrbtree.h b/public/tier1/utlrbtree.h index 745c162d..3f8d27f8 100644 --- a/public/tier1/utlrbtree.h +++ b/public/tier1/utlrbtree.h @@ -664,8 +664,11 @@ inline void CUtlRBTree::SetColor( I i, typename CUtlRBTree I CUtlRBTree::NewNode() { @@ -710,7 +713,9 @@ I CUtlRBTree::NewNode() return elem; } +#ifdef _WIN32 #pragma warning(pop) +#endif template < class T, class I, typename L, class M > void CUtlRBTree::FreeNode( I i ) diff --git a/public/tier1/utlvector.h b/public/tier1/utlvector.h index 15b1e262..6fffc6a2 100644 --- a/public/tier1/utlvector.h +++ b/public/tier1/utlvector.h @@ -324,9 +324,12 @@ public: // Especialy useful if you have a lot of vectors that are sparse, or if you're // carefully packing holders of vectors //----------------------------------------------------------------------------- + +#ifdef _WIN32 #pragma warning(push) #pragma 
warning(disable : 4200) // warning C4200: nonstandard extension used : zero-sized array in struct/union #pragma warning(disable : 4815 ) // warning C4815: 'staticData' : zero-sized array in stack object will have no elements +#endif class CUtlVectorUltraConservativeAllocator { @@ -573,7 +576,9 @@ private: } }; +#ifdef _WIN32 #pragma warning(pop) +#endif // Make sure nobody adds multiple inheritance and makes this class bigger. COMPILE_TIME_ASSERT( sizeof(CUtlVectorUltraConservative) == sizeof(void*) ); diff --git a/public/togl/linuxwin/glmgr.h b/public/togl/linuxwin/glmgr.h index 0b056dca..7e76a682 100644 --- a/public/togl/linuxwin/glmgr.h +++ b/public/togl/linuxwin/glmgr.h @@ -214,8 +214,8 @@ struct GLClipPlaneEnable_t { GLint enable; inline bool operator==( struct GLClipPlaneEquation_t { GLfloat x,y,z,w; inline bool operator==(const GLClipPlaneEquation_t& src) const { return EQ(x) && EQ(y) && EQ(z) && EQ(w); } }; //blend -struct GLColorMaskSingle_t { char r,g,b,a; inline bool operator==(const GLColorMaskSingle_t& src) const { return EQ(r) && EQ(g) && EQ(b) && EQ(a); } }; -struct GLColorMaskMultiple_t { char r,g,b,a; inline bool operator==(const GLColorMaskMultiple_t& src) const { return EQ(r) && EQ(g) && EQ(b) && EQ(a); } }; +struct GLColorMaskSingle_t { signed char r,g,b,a; inline bool operator==(const GLColorMaskSingle_t& src) const { return EQ(r) && EQ(g) && EQ(b) && EQ(a); } }; +struct GLColorMaskMultiple_t { signed char r,g,b,a; inline bool operator==(const GLColorMaskMultiple_t& src) const { return EQ(r) && EQ(g) && EQ(b) && EQ(a); } }; struct GLBlendEnable_t { GLint enable; inline bool operator==(const GLBlendEnable_t& src) const { return EQ(enable); } }; struct GLBlendFactor_t { GLenum srcfactor,dstfactor; inline bool operator==(const GLBlendFactor_t& src) const { return EQ(srcfactor) && EQ(dstfactor); } }; struct GLBlendEquation_t { GLenum equation; inline bool operator==(const GLBlendEquation_t& src) const { return EQ(equation); } }; @@ -225,7 +225,7 @@ struct GLBlendEnableSRGB_t { GLint enable; inline bool operator==( //depth struct GLDepthTestEnable_t { GLint enable; inline bool operator==(const GLDepthTestEnable_t& src) const { return EQ(enable); } }; struct GLDepthFunc_t { GLenum func; inline bool operator==(const GLDepthFunc_t& src) const { return EQ(func); } }; -struct GLDepthMask_t { char mask; inline bool operator==(const GLDepthMask_t& src) const { return EQ(mask); } }; +struct GLDepthMask_t { char mask; inline bool operator==(const GLDepthMask_t& src) const { return EQ(mask); } }; //stencil struct GLStencilTestEnable_t { GLint enable; inline bool operator==(const GLStencilTestEnable_t& src) const { return EQ(enable); } }; diff --git a/public/vgui/VGUI.h b/public/vgui/VGUI.h index 34fddfeb..01557119 100644 --- a/public/vgui/VGUI.h +++ b/public/vgui/VGUI.h @@ -22,6 +22,7 @@ #endif #endif +#ifdef _WIN32 #pragma warning( disable: 4800 ) // disables 'performance warning converting int to bool' #pragma warning( disable: 4786 ) // disables 'identifier truncated in browser information' warning #pragma warning( disable: 4355 ) // disables 'this' : used in base member initializer list @@ -29,6 +30,7 @@ #pragma warning( disable: 4514 ) // warning C4514: 'Color::Color' : unreferenced inline function has been removed #pragma warning( disable: 4100 ) // warning C4100: 'code' : unreferenced formal parameter #pragma warning( disable: 4127 ) // warning C4127: conditional expression is constant +#endif typedef unsigned char uchar; typedef unsigned short ushort; diff --git 
a/public/vstdlib/pch_vstdlib.h b/public/vstdlib/pch_vstdlib.h index 57242c41..bef1b7f1 100644 --- a/public/vstdlib/pch_vstdlib.h +++ b/public/vstdlib/pch_vstdlib.h @@ -11,8 +11,9 @@ // $NoKeywords: $ //============================================================================= - +#ifdef _WIN32 #pragma warning(disable: 4514) +#endif // First include standard libraries #include diff --git a/public/vstdlib/random.h b/public/vstdlib/random.h index fdfd09c6..6909a107 100644 --- a/public/vstdlib/random.h +++ b/public/vstdlib/random.h @@ -16,8 +16,10 @@ #define NTAB 32 +#ifdef _WIN32 #pragma warning(push) #pragma warning( disable:4251 ) +#endif //----------------------------------------------------------------------------- // A generator of uniformly distributed random numbers @@ -114,8 +116,9 @@ public: //----------------------------------------------------------------------------- VSTDLIB_INTERFACE void InstallUniformRandomStream( IUniformRandomStream *pStream ); - +#ifdef _WIN32 #pragma warning(pop) +#endif #endif // VSTDLIB_RANDOM_H diff --git a/tier0/cpu.cpp b/tier0/cpu.cpp index 63c89d27..cf6fd787 100644 --- a/tier0/cpu.cpp +++ b/tier0/cpu.cpp @@ -22,7 +22,9 @@ const tchar* GetProcessorVendorId(); static bool cpuid(unsigned long function, unsigned long& out_eax, unsigned long& out_ebx, unsigned long& out_ecx, unsigned long& out_edx) { -#if defined(GNUC) +#if defined (__arm__) || defined( _X360 ) + return false; +#elif defined(GNUC) asm("mov %%ebx, %%esi\n\t" "cpuid\n\t" "xchg %%esi, %%ebx" @@ -30,11 +32,9 @@ static bool cpuid(unsigned long function, unsigned long& out_eax, unsigned long& "=S" (out_ebx), "=c" (out_ecx), "=d" (out_edx) - : "a" (function) + : "a" (function) ); return true; -#elif defined( _X360 ) - return false; #elif defined(_WIN64) int pCPUInfo[4]; __cpuid( pCPUInfo, (int)function ); @@ -142,21 +142,21 @@ static bool IsWin98OrOlder() static bool CheckSSETechnology(void) { -#if defined( _X360 ) || defined( _PS3 ) +#if defined( __ARM__ ) + return false; +#elif defined( _X360 ) || defined( _PS3 ) return true; #else - if ( IsWin98OrOlder() ) - { + if ( IsWin98OrOlder() ) { return false; } - unsigned long eax,ebx,edx,unused; - if ( !cpuid(1,eax,ebx,unused,edx) ) - { + unsigned long eax,ebx,edx,unused; + if ( !cpuid(1,eax,ebx,unused,edx) ) { return false; } - return ( edx & 0x2000000L ) != 0; + return ( edx & 0x2000000L ) != 0; #endif } diff --git a/tier0/cpu_posix.cpp b/tier0/cpu_posix.cpp index 994a8ab0..ebd3dc9a 100644 --- a/tier0/cpu_posix.cpp +++ b/tier0/cpu_posix.cpp @@ -106,7 +106,7 @@ uint64 CalculateCPUFreq() uint64 retVal = 1000000; return retVal * atoi( pFreq ); } -#endif +#else // Try to open cpuinfo_max_freq. If the kernel was built with cpu scaling support disabled, this will fail. FILE *fp = fopen( "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq", "r" ); @@ -177,5 +177,6 @@ uint64 CalculateCPUFreq() } return period; +#endif } diff --git a/tier1/pathmatch.cpp b/tier1/pathmatch.cpp index f828a9ae..bd8d3cbd 100644 --- a/tier1/pathmatch.cpp +++ b/tier1/pathmatch.cpp @@ -57,6 +57,7 @@ #include #include + // Enable to do pathmatch caching. Beware: this code isn't threadsafe. // #define DO_PATHMATCH_CACHE @@ -66,7 +67,16 @@ static bool s_bShowDiag; #define DEBUG_MSG( ... 
) if ( s_bShowDiag ) fprintf( stderr, ##__VA_ARGS__ )
+
+#ifdef POSIX
+#include <signal.h>
+#define DEBUG_BREAK() raise(SIGINT)
+#elif !defined (__arm__)
 #define DEBUG_BREAK() __asm__ __volatile__ ( "int $3" )
+#else
+#define DEBUG_BREAK()
+#endif
+
 #define _COMPILE_TIME_ASSERT(pred) switch(0){case 0:case pred:;}
 
 #define WRAP( fn, ret, ... ) \
diff --git a/tier1/processor_detect_linux.cpp b/tier1/processor_detect_linux.cpp
index c7c95d04..189fef8a 100644
--- a/tier1/processor_detect_linux.cpp
+++ b/tier1/processor_detect_linux.cpp
@@ -6,6 +6,14 @@
 // $NoKeywords: $
 //===============================================================================//
 
+
+#if defined (__arm__)
+bool CheckMMXTechnology(void) { return false; }
+bool CheckSSETechnology(void) { return false; }
+bool CheckSSE2Technology(void) { return false; }
+bool Check3DNowTechnology(void) { return false; }
+#else
+
 #define cpuid(in,a,b,c,d) \
 	asm("pushl %%ebx\n\t" "cpuid\n\t" "movl %%ebx,%%esi\n\t" "pop %%ebx": "=a" (a), "=S" (b), "=c" (c), "=d" (d) : "a" (in));
 
@@ -45,3 +53,5 @@ bool Check3DNowTechnology(void)
 	}
 	return false;
 }
+
+#endif
diff --git a/tier1/reliabletimer.cpp b/tier1/reliabletimer.cpp
index af8b8421..ab46596f 100644
--- a/tier1/reliabletimer.cpp
+++ b/tier1/reliabletimer.cpp
@@ -13,6 +13,10 @@ bool CReliableTimer::sm_bUseQPC = false;
 #include "winlite.h"
 #endif
 
+#ifdef POSIX
+#include <time.h>
+#endif
+
 //-----------------------------------------------------------------------------
 // Purpose: Constructor
 //-----------------------------------------------------------------------------
@@ -83,6 +87,10 @@ int64 CReliableTimer::GetPerformanceCountNow()
 	uint64 ulNow;
 	SYS_TIMEBASE_GET( ulNow );
 	return ulNow;
+#elif defined( __arm__ ) && defined (POSIX)
+	struct timespec ts;
+	clock_gettime(CLOCK_REALTIME, &ts);
+	return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
 #else
 	uint64 un64;
 	__asm__ __volatile__ (
diff --git a/tier1/strtools.cpp b/tier1/strtools.cpp
index a63fa4fc..7f2fd270 100644
--- a/tier1/strtools.cpp
+++ b/tier1/strtools.cpp
@@ -3337,8 +3337,9 @@ const Tier1FullHTMLEntity_t g_Tier1_FullHTMLEntities[] =
 	{ L'\u00FF', "&yuml;", 6 },
 	{ 0, NULL, 0 } // sentinel for end of array
 };
+#ifdef _WIN32
 #pragma warning( pop )
-
+#endif
 
 bool V_BasicHtmlEntityEncode( char *pDest, const int nDestSize, char const *pIn, const int nInSize, bool bPreserveWhitespace /*= false*/ )
diff --git a/tier1/wscript b/tier1/wscript
index 0df2dc4f..c8a8673e 100755
--- a/tier1/wscript
+++ b/tier1/wscript
@@ -68,7 +68,8 @@ def build(bld):
 		'.',
 		'../public',
 		'../public/tier1',
-		'../public/tier0'
+		'../public/tier0',
+		'../common'
 	]
 
 	defines = []
diff --git a/vstdlib/coroutine.cpp b/vstdlib/coroutine.cpp
index 5cabe938..a1bcfc1d 100644
--- a/vstdlib/coroutine.cpp
+++ b/vstdlib/coroutine.cpp
@@ -223,11 +223,11 @@ extern "C" byte *GetStackPtr64();
 // Apple's version of gcc/g++ doesn't return the expected value using the intrinsic, so
 // do it the old fashioned way - this will also use asm on linux (since we don't compile
 // with llvm/clang there) but that seems fine.
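+// __builtin_frame_address(0) works on GCC and Clang alike and is not tied to
+// the x86 "esp" register, so it is the portable choice for ARM; the old
+// Clang-only path relied on it already.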
-#if defined(__llvm__) || defined(__clang__)
+//#if defined(__llvm__) || defined(__clang__)
 #define GetStackPtr( pStackPtr ) byte *pStackPtr = (byte*)__builtin_frame_address(0)
-#else
-#define GetStackPtr( pStackPtr ) register byte *pStackPtr __asm__( "esp" )
-#endif
+//#else
+//#define GetStackPtr( pStackPtr ) register byte *pStackPtr __asm__( "esp" )
+//#endif
 #elif defined(__SNC__)
 #define GetStackPtr( pStackPtr ) byte *pStackPtr = (byte*)__builtin_frame_address(0)
 #else
diff --git a/wscript b/wscript
index f408f343..fc5bfd2e 100644
--- a/wscript
+++ b/wscript
@@ -7,7 +7,7 @@ from waflib import Logs, Context, Configure
 import sys
 import os
 
-VERSION = '0.99'
+VERSION = '1.0'
 APPNAME = 'source-engine'
 
 top = '.'
@@ -28,24 +28,91 @@ int main() { return (int)FcInit(); }
 
 Context.Context.line_just = 55 # should fit for everything on 80x26
 
-projects=[
-	'tier0','tier1','tier2',
-	'vstdlib','vpklib','filesystem'
-	,'mathlib','tier3',
-	'bitmap','scenefilecache','datacache',
-	'launcher_main','vgui2/vgui_controls','vgui2/matsys_controls','vgui2/vgui_surfacelib',
-	'serverbrowser','soundemittersystem','vgui2/src',
-	'togl','vguimatsurface','vtf','materialsystem/shaderlib',
-	'materialsystem','studiorender','materialsystem/stdshaders',
-	'video','inputsystem','appframework',
-	'launcher','engine/voice_codecs/minimp3','materialsystem/shaderapidx9',
-	'gameui','dmxloader','datamodel','engine','ivp/havana',
-	'ivp/havana/havok/hk_math','ivp/havana/havok/hk_base',
-	'ivp/ivp_compact_builder','ivp/ivp_physics','vphysics','game/server',
-	'particles','choreoobjects','game/client'
-]
+projects={
+	'game': [
+		'appframework',
+		'bitmap',
+		'choreoobjects',
+		'datacache',
+		'datamodel',
+		'dmxloader',
+		'engine',
+		'engine/voice_codecs/minimp3',
+		'filesystem',
+		'game/client',
+		'game/server',
+		'gameui',
+		'inputsystem',
+		'ivp/havana',
+		'ivp/havana/havok/hk_base',
+		'ivp/havana/havok/hk_math',
+		'ivp/ivp_compact_builder',
+		'ivp/ivp_physics',
+		'launcher',
+		'launcher_main',
+		'materialsystem',
+		'materialsystem/shaderapidx9',
+		'materialsystem/shaderlib',
+		'materialsystem/stdshaders',
+		'mathlib',
+		'particles',
+		'scenefilecache',
+		'serverbrowser',
+		'soundemittersystem',
+		'studiorender',
+		'thirdparty/StubSteamAPI',
+		'tier0',
+		'tier1',
+		'tier2',
+		'tier3',
+		'togl',
+		'vgui2/matsys_controls',
+		'vgui2/src',
+		'vgui2/vgui_controls',
+		'vgui2/vgui_surfacelib',
+		'vguimatsurface',
+		'video',
+		'vphysics',
+		'vpklib',
+		'vstdlib',
+		'vtf'
+	],
+	'dedicated': [
+		'appframework',
+		'bitmap',
+		'choreoobjects',
+		'datacache',
+		'dedicated',
+		'dedicated_main',
+		'dmxloader',
+		'engine',
+		'game/server',
+		'ivp/havana',
+		'ivp/havana/havok/hk_base',
+		'ivp/havana/havok/hk_math',
+		'ivp/ivp_compact_builder',
+		'ivp/ivp_physics',
+		'materialsystem',
+		'mathlib',
+		'particles',
+		'scenefilecache',
+		'materialsystem/shaderapiempty',
+		'materialsystem/shaderlib',
+		'soundemittersystem',
+		'studiorender',
+		'tier0',
+		'tier1',
+		'tier2',
+		'tier3',
+		'vphysics',
+		'vpklib',
+		'vstdlib',
+		'vtf',
+		'thirdparty/StubSteamAPI'
+	]
+}
+
 
-projects += ['thirdparty/StubSteamAPI'] # ,'thirdparty/libjpeg','thirdparty/SDL2-src'] # thirdparty projects
 
 @Configure.conf
 def check_pkg(conf, package, uselib_store, fragment, *k, **kw):
@@ -70,7 +137,23 @@ def get_taskgen_count(self):
 	return idx
 
 def define_platform(conf):
-	conf.define('SOURCE1',1)
+	conf.env.DEDICATED = conf.options.DEDICATED
+
+	if conf.options.DEDICATED:
+		conf.options.SDL = False
+#		conf.options.GL = False
+		conf.define('DEDICATED', 1)
+
+	if conf.options.GL:
+		conf.env.append_unique('DEFINES', [
+			'DX_TO_GL_ABSTRACTION',
+			'GL_GLEXT_PROTOTYPES',
+			'BINK_VIDEO'
+		])
+
+	if conf.options.SDL:
+		conf.define('USE_SDL', 1)
+
 	if conf.env.DEST_OS == 'linux':
 		conf.define('_GLIBCXX_USE_CXX11_ABI',0)
 		conf.env.append_unique('DEFINES', [
@@ -79,24 +162,30 @@ def define_platform(conf):
 			'POSIX=1',
 			'_POSIX=1',
 			'GNUC',
-			'DX_TO_GL_ABSTRACTION',
-			'GL_GLEXT_PROTOTYPES',
-			'BINK_VIDEO',
-			'USE_SDL',
 			'NDEBUG',
 			'NO_HOOK_MALLOC',
 			'_DLL_EXT=.so'
 		])
 
+
 def options(opt):
 	grp = opt.add_option_group('Common options')
 
 	grp.add_option('-8', '--64bits', action = 'store_true', dest = 'ALLOW64', default = False,
 		help = 'allow targetting 64-bit engine(Linux/Windows/OSX x86 only) [default: %default]')
 
+	grp.add_option('-d', '--dedicated', action = 'store_true', dest = 'DEDICATED', default = False,
+		help = 'build dedicated server [default: %default]')
+
+	grp.add_option('--use-sdl', action = 'store', dest = 'SDL', type = 'int', default = True,
+		help = 'build engine with SDL [default: %default]')
+
+	grp.add_option('--use-togl', action = 'store', dest = 'GL', type = 'int', default = True,
+		help = 'build engine with ToGL [default: %default]')
+
 	opt.load('compiler_optimizations subproject')
-	opt.add_subproject(projects)
+#	opt.add_subproject(projects['game'])
 
 	opt.load('xcompile compiler_cxx compiler_c sdl2 clang_compilation_database strip_on_install waf_unit_test subproject')
 
 	if sys.platform == 'win32':
@@ -104,6 +193,7 @@ def options(opt):
 	opt.load('reconfigure')
 
 def configure(conf):
+
 	conf.load('fwgslib reconfigure')
 
 	# Force XP compability, all build targets should add
@@ -115,13 +205,16 @@ def configure(conf):
 		conf.load('msvc msvc_pdb msdev msvs')
 	conf.load('subproject xcompile compiler_c compiler_cxx gitversion clang_compilation_database strip_on_install waf_unit_test enforce_pic')
 
-	conf.check_cfg(package='sdl2', uselib_store='SDL2', args=['--cflags', '--libs'])
+	if conf.options.SDL:
+		conf.check_cfg(package='sdl2', uselib_store='SDL2', args=['--cflags', '--libs'])
+	if conf.options.DEDICATED:
+		conf.check_cfg(package='libedit', uselib_store='EDIT', args=['--cflags', '--libs'])
+
 	conf.check_cfg(package='libjpeg', uselib_store='JPEG', args=['--cflags', '--libs'])
 	conf.check_cfg(package='libpng', uselib_store='PNG', args=['--cflags', '--libs'])
 	conf.check_cfg(package='zlib', uselib_store='ZLIB', args=['--cflags', '--libs'])
 	conf.check_cfg(package='openal', uselib_store='OPENAL', args=['--cflags', '--libs'])
 	conf.check_cfg(package='libcurl', uselib_store='CURL', args=['--cflags', '--libs'])
-	conf.check_cfg(package='bzip2', uselib_store='BZIP2', args=['--cflags', '--libs'])
 	conf.check_pkg('freetype2', 'FT2', FT2_CHECK)
 	conf.check_pkg('fontconfig', 'FC', FC_CHECK)
@@ -150,8 +243,17 @@ def configure(conf):
 	]
 
 	cflags, linkflags = conf.get_optimization_flags()
-	cflags += ['-march=pentium4','-mtune=core2','-mfpmath=387']
-	linkflags += ['-march=pentium4','-mtune=core2','-mfpmath=387']
+
+	flags = ['-fPIC']
+
+	if conf.env.DEST_CPU == 'arm':
+		flags += ['-mfpu=neon']
+	else:
+		flags += ['-march=pentium4','-mtune=core2','-mfpmath=387']
+
+	cflags += flags
+	linkflags += flags
+
 	# And here C++ flags starts to be treated separately
 	cxxflags = list(cflags) + ['-std=c++11','-fpermissive']
@@ -180,9 +282,11 @@ def configure(conf):
 		conf.env.append_unique('CFLAGS', cflags)
 		conf.env.append_unique('CXXFLAGS', cxxflags)
 		conf.env.append_unique('LINKFLAGS', linkflags)
+		conf.env.append_unique('INCLUDES', [os.path.abspath('common/')])
 
 	if conf.env.DEST_OS != 'win32':
 		conf.check_cc(lib='dl', mandatory=False)
+		conf.check_cc(lib='bz2', mandatory=False)
 		conf.check_cc(lib='rt', mandatory=False)
 
 		if not conf.env.LIB_M: # HACK: already added in xcompile!
@@ -221,7 +325,14 @@ def configure(conf):
 		conf.env.LIBDIR = conf.env.BINDIR = conf.env.PREFIX
 
 	define_platform(conf)
-	conf.add_subproject(projects)
+
+	if conf.options.DEDICATED:
+		conf.add_subproject(projects['dedicated'])
+	else:
+		conf.add_subproject(projects['game'])
 
 def build(bld):
-	bld.add_subproject(projects)
+	if bld.env.DEDICATED:
+		bld.add_subproject(projects['dedicated'])
+	else:
+		bld.add_subproject(projects['game'])
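
A few standalone sketches of the techniques this patch leans on, for trying the pieces in isolation. First, the feature-detection pattern from the tier0/cpu.cpp and tier1/processor_detect_linux.cpp hunks: ARM has no cpuid instruction, so every x86 capability probe must fail cleanly before any inline asm is reached. The sketch below follows that pattern but uses GCC's <cpuid.h> helper instead of the engine's hand-rolled asm; its CheckSSETechnology is a stand-in, not the engine's exact function:

    #include <stdio.h>

    #if defined(__arm__) || defined(__aarch64__)
    // No cpuid on ARM: report the x86 feature as absent instead of running x86 asm.
    static bool CheckSSETechnology() { return false; }
    #else
    #include <cpuid.h>
    static bool CheckSSETechnology()
    {
        unsigned int eax, ebx, ecx, edx;
        // __get_cpuid returns 0 if leaf 1 is not supported.
        if ( !__get_cpuid( 1, &eax, &ebx, &ecx, &edx ) )
            return false;
        return ( edx & ( 1u << 25 ) ) != 0; // EDX bit 25 = SSE, i.e. the 0x2000000L above
    }
    #endif

    int main()
    {
        printf( "SSE: %s\n", CheckSSETechnology() ? "yes" : "no" );
        return 0;
    }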
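Second, the reason common/ lands on the include paths (the INCLUDES line and the tier1/wscript hunk): common/sse2neon.h reimplements the _mm_* intrinsics on top of NEON, so SSE-using translation units can compile unchanged on ARM. A minimal sketch of the include selection, assuming the unit only needs the classic xmmintrin-level API:

    #if defined(__arm__) || defined(__aarch64__)
    #include "sse2neon.h"   // from common/, which the wscript puts on INCLUDES
    #else
    #include <xmmintrin.h>  // native SSE on x86
    #endif

    int main()
    {
        __m128 a = _mm_set_ps( 4.f, 3.f, 2.f, 1.f );
        __m128 b = _mm_set_ps( 8.f, 7.f, 6.f, 5.f );
        __m128 c = _mm_add_ps( a, b );    // maps to NEON vaddq_f32 on ARM
        float out[4];
        _mm_storeu_ps( out, c );
        return (int)out[0];               // lane 0 holds 6.0f
    }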
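Third, the tier1/reliabletimer.cpp fallback replaces the rdtsc read with a POSIX clock. The sketch below does the same fold of seconds and nanoseconds into one 64-bit tick count, but samples CLOCK_MONOTONIC where the patch uses CLOCK_REALTIME; the monotonic clock has the identical interface and is immune to wall-clock adjustments, which is usually what an interval timer wants. Whether that trade-off matters for CReliableTimer is left as a review question, not settled here:

    #include <time.h>
    #include <stdint.h>
    #include <stdio.h>

    // Stand-in for the ARM branch of CReliableTimer::GetPerformanceCountNow();
    // with this tick source the counter frequency is a fixed 1,000,000,000 per second.
    static int64_t GetTickCountNow()
    {
        struct timespec ts;
        clock_gettime( CLOCK_MONOTONIC, &ts );
        return (int64_t)ts.tv_sec * 1000000000LL + ts.tv_nsec;
    }

    int main()
    {
        int64_t begin = GetTickCountNow();
        for ( volatile int i = 0; i < 1000000; ++i ) {} // something to time
        printf( "elapsed: %lld ns\n", (long long)( GetTickCountNow() - begin ) );
        return 0;
    }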
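Last, the DEBUG_BREAK() rework from tier1/pathmatch.cpp in miniature: "int $3" exists only on x86, so POSIX builds trap through a signal and any remaining target compiles the macro away. Two caveats in this sketch: POSIX is the engine's own define (set by the wscript above), not a compiler builtin, and SIGINT mirrors the patch even though SIGTRAP would be the more conventional break signal:

    #include <signal.h>

    #if defined(POSIX)                                  // engine-defined, not a builtin
    #define DEBUG_BREAK() raise( SIGINT )               // stops in the debugger, as in the patch
    #elif defined(__i386__) || defined(__x86_64__)
    #define DEBUG_BREAK() __asm__ __volatile__ ( "int $3" )
    #else
    #define DEBUG_BREAK()                               // no portable break: compile it away
    #endif

    int main()
    {
        DEBUG_BREAK(); // under a debugger this halts here; otherwise SIGINT ends the process
        return 0;
    }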