From 50a93ce91a368e28aae20b9a1ea109ed9bdc51b4 Mon Sep 17 00:00:00 2001
From: nillerusr
Date: Sun, 25 Apr 2021 23:36:09 +0300
Subject: [PATCH] build: arm target support

---
 common/curl/curl.h                    | 2119 -------
 common/curl/curlbuild.h               |  583 --
 common/curl/curlrules.h               |  252 -
 common/curl/curlver.h                 |   69 -
 common/curl/easy.h                    |  102 -
 common/curl/mprintf.h                 |   81 -
 common/curl/multi.h                   |  345 --
 common/curl/stdcheaders.h             |   33 -
 common/curl/typecheck-gcc.h           |  584 --
 common/curl/types.h                   |    1 -
 common/sse2neon.h                     | 7598 +++++++++++++++++++++++++
 dedicated/wscript                     |   68 +
 dedicated_main/main.cpp               |    3 +
 dedicated_main/wscript                |   43 +
 engine/sys_dll2.cpp                   |    2 +-
 engine/sys_engine.cpp                 |    7 +
 engine/wscript                        |  226 +-
 filesystem/basefilesystem.cpp         |    1 -
 inputsystem/wscript                   |    5 +-
 launcher_main/main.cpp                |    5 +-
 materialsystem/shaderapiempty/wscript |   52 +
 mathlib/sse.cpp                       |   34 +-
 public/bitmap/imageformat.h           |    2 +
 public/materialsystem/imesh.h         |   25 +-
 public/mathlib/mathlib.h              |   13 +-
 public/mathlib/ssemath.h              |    4 +-
 public/mathlib/vector.h               |   10 +-
 public/mathlib/vector4d.h             |   16 +-
 public/saverestoretypes.h             |   14 +-
 public/tier0/platform.h               |   14 +-
 public/tier0/threadtools.h            |    5 +-
 public/tier1/convar.h                 |    5 +-
 public/tier1/utlblockmemory.h         |    2 +
 public/tier1/utlfixedmemory.h         |    2 +
 public/tier1/utllinkedlist.h          |    4 +
 public/tier1/utlmemory.h              |    3 +-
 public/tier1/utlrbtree.h              |    5 +
 public/tier1/utlvector.h              |    5 +
 public/togl/linuxwin/glmgr.h          |    6 +-
 public/vgui/VGUI.h                    |    2 +
 public/vstdlib/pch_vstdlib.h          |    3 +-
 public/vstdlib/random.h               |    5 +-
 tier0/cpu.cpp                         |   22 +-
 tier0/cpu_posix.cpp                   |    3 +-
 tier1/pathmatch.cpp                   |   10 +
 tier1/processor_detect_linux.cpp      |   10 +
 tier1/reliabletimer.cpp               |    8 +
 tier1/strtools.cpp                    |    3 +-
 tier1/wscript                         |    3 +-
 vstdlib/coroutine.cpp                 |    8 +-
 wscript                               |  171 +-
 51 files changed, 8210 insertions(+), 4386 deletions(-)
 delete mode 100644 common/curl/curl.h
 delete mode 100644 common/curl/curlbuild.h
 delete mode 100644 common/curl/curlrules.h
 delete mode 100644 common/curl/curlver.h
 delete mode 100644 common/curl/easy.h
 delete mode 100644 common/curl/mprintf.h
 delete mode 100644 common/curl/multi.h
 delete mode 100644 common/curl/stdcheaders.h
 delete mode 100644 common/curl/typecheck-gcc.h
 delete mode 100644 common/curl/types.h
 create mode 100644 common/sse2neon.h
 create mode 100755 dedicated/wscript
 create mode 100755 dedicated_main/wscript
 create mode 100755 materialsystem/shaderapiempty/wscript

diff --git a/common/curl/curl.h b/common/curl/curl.h
deleted file mode 100644
index cb9d0fbf..00000000
--- a/common/curl/curl.h
+++ /dev/null
@@ -1,2119 +0,0 @@
-#ifndef __CURL_CURL_H
-#define __CURL_CURL_H
-/***************************************************************************
- *                                  _   _ ____  _
- *  Project                     ___| | | |  _ \| |
- *                             / __| | | | |_) | |
- *                            | (__| |_| |  _ <| |___
- *                             \___|\___/|_| \_\_____|
- *
- * Copyright (C) 1998 - 2010, Daniel Stenberg, <daniel@haxx.se>, et al.
- *
- * This software is licensed as described in the file COPYING, which
- * you should have received as part of this distribution. The terms
- * are also available at http://curl.haxx.se/docs/copyright.html.
- *
- * You may opt to use, copy, modify, merge, publish, distribute and/or sell
- * copies of the Software, and permit persons to whom the Software is
- * furnished to do so, under the terms of the COPYING file.
- *
- * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY
- * KIND, either express or implied.
- *
- ***************************************************************************/
-
-/*
- * If you have libcurl problems, all docs and details are found here:
- *   http://curl.haxx.se/libcurl/
- *
- * curl-library mailing list subscription and unsubscription web interface:
- *   http://cool.haxx.se/mailman/listinfo/curl-library/
- */
-
-#include "curlver.h"   /* libcurl version defines */
-#include "curlbuild.h" /* libcurl build definitions */
-#include "curlrules.h" /* libcurl rules enforcement */
-
-/*
- * Define WIN32 when build target is Win32 API
- */
-
-#if (defined(_WIN32) || defined(__WIN32__)) && \
-     !defined(WIN32) && !defined(__SYMBIAN32__)
-#define WIN32
-#endif
-
-#include <stdio.h>
-#include <limits.h>
-
-#if defined(__FreeBSD__) && (__FreeBSD__ >= 2)
-/* Needed for __FreeBSD_version symbol definition */
-#include <osreldate.h>
-#endif
-
-/* The include stuff here below is mainly for time_t! */
-#include <sys/types.h>
-#include <time.h>
-
-#if defined(WIN32) && !defined(_WIN32_WCE) && !defined(__GNUC__) && \
-  !defined(__CYGWIN__) || defined(__MINGW32__)
-#if !(defined(_WINSOCKAPI_) || defined(_WINSOCK_H))
-/* The check above prevents the winsock2 inclusion if winsock.h already was
-   included, since they can't co-exist without problems */
-#include <winsock2.h>
-#include <ws2tcpip.h>
-#endif
-#else
-
-/* HP-UX systems version 9, 10 and 11 lack sys/select.h and so does oldish
-   libc5-based Linux systems. Only include it on system that are known to
-   require it! */
-#if defined(_AIX) || defined(__NOVELL_LIBC__) || defined(__NetBSD__) || \
-    defined(__minix) || defined(__SYMBIAN32__) || defined(__INTEGRITY) || \
-    defined(ANDROID) || \
-   (defined(__FreeBSD_version) && (__FreeBSD_version < 800000))
-#include <sys/select.h>
-#endif
-
-#ifndef _WIN32_WCE
-#include <sys/socket.h>
-#endif
-#if !defined(WIN32) && !defined(__WATCOMC__) && !defined(__VXWORKS__)
-#include <sys/time.h>
-#endif
-#include <sys/types.h>
-#endif
-
-#ifdef __BEOS__
-#include <support/SupportDefs.h>
-#endif
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef void CURL;
-
-/*
- * Decorate exportable functions for Win32 and Symbian OS DLL linking.
- * This avoids using a .def file for building libcurl.dll.
- */
-#if (defined(WIN32) || defined(_WIN32) || defined(__SYMBIAN32__)) && \
-     !defined(CURL_STATICLIB)
-#if defined(BUILDING_LIBCURL)
-#define CURL_EXTERN __declspec(dllexport)
-#else
-#define CURL_EXTERN __declspec(dllimport)
-#endif
-#else
-
-#ifdef CURL_HIDDEN_SYMBOLS
-/*
- * This definition is used to make external definitions visible in the
- * shared library when symbols are hidden by default. It makes no
- * difference when compiling applications whether this is set or not,
- * only when compiling the library.
- */ -#define CURL_EXTERN CURL_EXTERN_SYMBOL -#else -#define CURL_EXTERN -#endif -#endif - -#ifndef curl_socket_typedef -/* socket typedef */ -#ifdef WIN32 -typedef SOCKET curl_socket_t; -#define CURL_SOCKET_BAD INVALID_SOCKET -#else -typedef int curl_socket_t; -#define CURL_SOCKET_BAD -1 -#endif -#define curl_socket_typedef -#endif /* curl_socket_typedef */ - -struct curl_httppost { - struct curl_httppost *next; /* next entry in the list */ - char *name; /* pointer to allocated name */ - long namelength; /* length of name length */ - char *contents; /* pointer to allocated data contents */ - long contentslength; /* length of contents field */ - char *buffer; /* pointer to allocated buffer contents */ - long bufferlength; /* length of buffer field */ - char *contenttype; /* Content-Type */ - struct curl_slist* contentheader; /* list of extra headers for this form */ - struct curl_httppost *more; /* if one field name has more than one - file, this link should link to following - files */ - long flags; /* as defined below */ -#define HTTPPOST_FILENAME (1<<0) /* specified content is a file name */ -#define HTTPPOST_READFILE (1<<1) /* specified content is a file name */ -#define HTTPPOST_PTRNAME (1<<2) /* name is only stored pointer - do not free in formfree */ -#define HTTPPOST_PTRCONTENTS (1<<3) /* contents is only stored pointer - do not free in formfree */ -#define HTTPPOST_BUFFER (1<<4) /* upload file from buffer */ -#define HTTPPOST_PTRBUFFER (1<<5) /* upload file from pointer contents */ -#define HTTPPOST_CALLBACK (1<<6) /* upload file contents by using the - regular read callback to get the data - and pass the given pointer as custom - pointer */ - - char *showfilename; /* The file name to show. If not set, the - actual file name will be used (if this - is a file part) */ - void *userp; /* custom pointer used for - HTTPPOST_CALLBACK posts */ -}; - -typedef int (*curl_progress_callback)(void *clientp, - double dltotal, - double dlnow, - double ultotal, - double ulnow); - -#ifndef CURL_MAX_WRITE_SIZE - /* Tests have proven that 20K is a very bad buffer size for uploads on - Windows, while 16K for some odd reason performed a lot better. - We do the ifndef check to allow this value to easier be changed at build - time for those who feel adventurous. The practical minimum is about - 400 bytes since libcurl uses a buffer of this size as a scratch area - (unrelated to network send operations). */ -#define CURL_MAX_WRITE_SIZE 16384 -#endif - -#ifndef CURL_MAX_HTTP_HEADER -/* The only reason to have a max limit for this is to avoid the risk of a bad - server feeding libcurl with a never-ending header that will cause reallocs - infinitely */ -#define CURL_MAX_HTTP_HEADER (100*1024) -#endif - - -/* This is a magic return code for the write callback that, when returned, - will signal libcurl to pause receiving on the current transfer. 
*/ -#define CURL_WRITEFUNC_PAUSE 0x10000001 -typedef size_t (*curl_write_callback)(char *buffer, - size_t size, - size_t nitems, - void *outstream); - - - -/* enumeration of file types */ -typedef enum { - CURLFILETYPE_FILE = 0, - CURLFILETYPE_DIRECTORY, - CURLFILETYPE_SYMLINK, - CURLFILETYPE_DEVICE_BLOCK, - CURLFILETYPE_DEVICE_CHAR, - CURLFILETYPE_NAMEDPIPE, - CURLFILETYPE_SOCKET, - CURLFILETYPE_DOOR, /* is possible only on Sun Solaris now */ - - CURLFILETYPE_UNKNOWN /* should never occur */ -} curlfiletype; - -#define CURLFINFOFLAG_KNOWN_FILENAME (1<<0) -#define CURLFINFOFLAG_KNOWN_FILETYPE (1<<1) -#define CURLFINFOFLAG_KNOWN_TIME (1<<2) -#define CURLFINFOFLAG_KNOWN_PERM (1<<3) -#define CURLFINFOFLAG_KNOWN_UID (1<<4) -#define CURLFINFOFLAG_KNOWN_GID (1<<5) -#define CURLFINFOFLAG_KNOWN_SIZE (1<<6) -#define CURLFINFOFLAG_KNOWN_HLINKCOUNT (1<<7) - -/* Content of this structure depends on information which is known and is - achievable (e.g. by FTP LIST parsing). Please see the url_easy_setopt(3) man - page for callbacks returning this structure -- some fields are mandatory, - some others are optional. The FLAG field has special meaning. */ -struct curl_fileinfo { - char *filename; - curlfiletype filetype; - time_t time; - unsigned int perm; - int uid; - int gid; - curl_off_t size; - long int hardlinks; - - struct { - /* If some of these fields is not NULL, it is a pointer to b_data. */ - char *time; - char *perm; - char *user; - char *group; - char *target; /* pointer to the target filename of a symlink */ - } strings; - - unsigned int flags; - - /* used internally */ - char * b_data; - size_t b_size; - size_t b_used; -}; - -/* return codes for CURLOPT_CHUNK_BGN_FUNCTION */ -#define CURL_CHUNK_BGN_FUNC_OK 0 -#define CURL_CHUNK_BGN_FUNC_FAIL 1 /* tell the lib to end the task */ -#define CURL_CHUNK_BGN_FUNC_SKIP 2 /* skip this chunk over */ - -/* if splitting of data transfer is enabled, this callback is called before - download of an individual chunk started. Note that parameter "remains" works - only for FTP wildcard downloading (for now), otherwise is not used */ -typedef long (*curl_chunk_bgn_callback)(const void *transfer_info, - void *ptr, - int remains); - -/* return codes for CURLOPT_CHUNK_END_FUNCTION */ -#define CURL_CHUNK_END_FUNC_OK 0 -#define CURL_CHUNK_END_FUNC_FAIL 1 /* tell the lib to end the task */ - -/* If splitting of data transfer is enabled this callback is called after - download of an individual chunk finished. - Note! After this callback was set then it have to be called FOR ALL chunks. - Even if downloading of this chunk was skipped in CHUNK_BGN_FUNC. - This is the reason why we don't need "transfer_info" parameter in this - callback and we are not interested in "remains" parameter too. */ -typedef long (*curl_chunk_end_callback)(void *ptr); - -/* return codes for FNMATCHFUNCTION */ -#define CURL_FNMATCHFUNC_MATCH 0 /* string corresponds to the pattern */ -#define CURL_FNMATCHFUNC_NOMATCH 1 /* pattern doesn't match the string */ -#define CURL_FNMATCHFUNC_FAIL 2 /* an error occurred */ - -/* callback type for wildcard downloading pattern matching. If the - string matches the pattern, return CURL_FNMATCHFUNC_MATCH value, etc. 
*/ -typedef int (*curl_fnmatch_callback)(void *ptr, - const char *pattern, - const char *string); - -/* These are the return codes for the seek callbacks */ -#define CURL_SEEKFUNC_OK 0 -#define CURL_SEEKFUNC_FAIL 1 /* fail the entire transfer */ -#define CURL_SEEKFUNC_CANTSEEK 2 /* tell libcurl seeking can't be done, so - libcurl might try other means instead */ -typedef int (*curl_seek_callback)(void *instream, - curl_off_t offset, - int origin); /* 'whence' */ - -/* This is a return code for the read callback that, when returned, will - signal libcurl to immediately abort the current transfer. */ -#define CURL_READFUNC_ABORT 0x10000000 -/* This is a return code for the read callback that, when returned, will - signal libcurl to pause sending data on the current transfer. */ -#define CURL_READFUNC_PAUSE 0x10000001 - -typedef size_t (*curl_read_callback)(char *buffer, - size_t size, - size_t nitems, - void *instream); - -typedef enum { - CURLSOCKTYPE_IPCXN, /* socket created for a specific IP connection */ - CURLSOCKTYPE_LAST /* never use */ -} curlsocktype; - -typedef int (*curl_sockopt_callback)(void *clientp, - curl_socket_t curlfd, - curlsocktype purpose); - -struct curl_sockaddr { - int family; - int socktype; - int protocol; - unsigned int addrlen; /* addrlen was a socklen_t type before 7.18.0 but it - turned really ugly and painful on the systems that - lack this type */ - struct sockaddr addr; -}; - -typedef curl_socket_t -(*curl_opensocket_callback)(void *clientp, - curlsocktype purpose, - struct curl_sockaddr *address); - -typedef enum { - CURLIOE_OK, /* I/O operation successful */ - CURLIOE_UNKNOWNCMD, /* command was unknown to callback */ - CURLIOE_FAILRESTART, /* failed to restart the read */ - CURLIOE_LAST /* never use */ -} curlioerr; - -typedef enum { - CURLIOCMD_NOP, /* no operation */ - CURLIOCMD_RESTARTREAD, /* restart the read stream from start */ - CURLIOCMD_LAST /* never use */ -} curliocmd; - -typedef curlioerr (*curl_ioctl_callback)(CURL *handle, - int cmd, - void *clientp); - -/* - * The following typedef's are signatures of malloc, free, realloc, strdup and - * calloc respectively. Function pointers of these types can be passed to the - * curl_global_init_mem() function to set user defined memory management - * callback routines. - */ -typedef void *(*curl_malloc_callback)(size_t size); -typedef void (*curl_free_callback)(void *ptr); -typedef void *(*curl_realloc_callback)(void *ptr, size_t size); -typedef char *(*curl_strdup_callback)(const char *str); -typedef void *(*curl_calloc_callback)(size_t nmemb, size_t size); - -/* the kind of data that is passed to information_callback*/ -typedef enum { - CURLINFO_TEXT = 0, - CURLINFO_HEADER_IN, /* 1 */ - CURLINFO_HEADER_OUT, /* 2 */ - CURLINFO_DATA_IN, /* 3 */ - CURLINFO_DATA_OUT, /* 4 */ - CURLINFO_SSL_DATA_IN, /* 5 */ - CURLINFO_SSL_DATA_OUT, /* 6 */ - CURLINFO_END -} curl_infotype; - -typedef int (*curl_debug_callback) - (CURL *handle, /* the handle/transfer this concerns */ - curl_infotype type, /* what kind of data */ - char *data, /* points to the data */ - size_t size, /* size of the data pointed to */ - void *userptr); /* whatever the user please */ - -/* All possible error codes from all sorts of curl functions. Future versions - may return other values, stay prepared. - - Always add new return codes last. Never *EVER* remove any. The return - codes must remain the same! 
- */ - -typedef enum { - CURLE_OK = 0, - CURLE_UNSUPPORTED_PROTOCOL, /* 1 */ - CURLE_FAILED_INIT, /* 2 */ - CURLE_URL_MALFORMAT, /* 3 */ - CURLE_OBSOLETE4, /* 4 - NOT USED */ - CURLE_COULDNT_RESOLVE_PROXY, /* 5 */ - CURLE_COULDNT_RESOLVE_HOST, /* 6 */ - CURLE_COULDNT_CONNECT, /* 7 */ - CURLE_FTP_WEIRD_SERVER_REPLY, /* 8 */ - CURLE_REMOTE_ACCESS_DENIED, /* 9 a service was denied by the server - due to lack of access - when login fails - this is not returned. */ - CURLE_OBSOLETE10, /* 10 - NOT USED */ - CURLE_FTP_WEIRD_PASS_REPLY, /* 11 */ - CURLE_OBSOLETE12, /* 12 - NOT USED */ - CURLE_FTP_WEIRD_PASV_REPLY, /* 13 */ - CURLE_FTP_WEIRD_227_FORMAT, /* 14 */ - CURLE_FTP_CANT_GET_HOST, /* 15 */ - CURLE_OBSOLETE16, /* 16 - NOT USED */ - CURLE_FTP_COULDNT_SET_TYPE, /* 17 */ - CURLE_PARTIAL_FILE, /* 18 */ - CURLE_FTP_COULDNT_RETR_FILE, /* 19 */ - CURLE_OBSOLETE20, /* 20 - NOT USED */ - CURLE_QUOTE_ERROR, /* 21 - quote command failure */ - CURLE_HTTP_RETURNED_ERROR, /* 22 */ - CURLE_WRITE_ERROR, /* 23 */ - CURLE_OBSOLETE24, /* 24 - NOT USED */ - CURLE_UPLOAD_FAILED, /* 25 - failed upload "command" */ - CURLE_READ_ERROR, /* 26 - couldn't open/read from file */ - CURLE_OUT_OF_MEMORY, /* 27 */ - /* Note: CURLE_OUT_OF_MEMORY may sometimes indicate a conversion error - instead of a memory allocation error if CURL_DOES_CONVERSIONS - is defined - */ - CURLE_OPERATION_TIMEDOUT, /* 28 - the timeout time was reached */ - CURLE_OBSOLETE29, /* 29 - NOT USED */ - CURLE_FTP_PORT_FAILED, /* 30 - FTP PORT operation failed */ - CURLE_FTP_COULDNT_USE_REST, /* 31 - the REST command failed */ - CURLE_OBSOLETE32, /* 32 - NOT USED */ - CURLE_RANGE_ERROR, /* 33 - RANGE "command" didn't work */ - CURLE_HTTP_POST_ERROR, /* 34 */ - CURLE_SSL_CONNECT_ERROR, /* 35 - wrong when connecting with SSL */ - CURLE_BAD_DOWNLOAD_RESUME, /* 36 - couldn't resume download */ - CURLE_FILE_COULDNT_READ_FILE, /* 37 */ - CURLE_LDAP_CANNOT_BIND, /* 38 */ - CURLE_LDAP_SEARCH_FAILED, /* 39 */ - CURLE_OBSOLETE40, /* 40 - NOT USED */ - CURLE_FUNCTION_NOT_FOUND, /* 41 */ - CURLE_ABORTED_BY_CALLBACK, /* 42 */ - CURLE_BAD_FUNCTION_ARGUMENT, /* 43 */ - CURLE_OBSOLETE44, /* 44 - NOT USED */ - CURLE_INTERFACE_FAILED, /* 45 - CURLOPT_INTERFACE failed */ - CURLE_OBSOLETE46, /* 46 - NOT USED */ - CURLE_TOO_MANY_REDIRECTS , /* 47 - catch endless re-direct loops */ - CURLE_UNKNOWN_TELNET_OPTION, /* 48 - User specified an unknown option */ - CURLE_TELNET_OPTION_SYNTAX , /* 49 - Malformed telnet option */ - CURLE_OBSOLETE50, /* 50 - NOT USED */ - CURLE_PEER_FAILED_VERIFICATION, /* 51 - peer's certificate or fingerprint - wasn't verified fine */ - CURLE_GOT_NOTHING, /* 52 - when this is a specific error */ - CURLE_SSL_ENGINE_NOTFOUND, /* 53 - SSL crypto engine not found */ - CURLE_SSL_ENGINE_SETFAILED, /* 54 - can not set SSL crypto engine as - default */ - CURLE_SEND_ERROR, /* 55 - failed sending network data */ - CURLE_RECV_ERROR, /* 56 - failure in receiving network data */ - CURLE_OBSOLETE57, /* 57 - NOT IN USE */ - CURLE_SSL_CERTPROBLEM, /* 58 - problem with the local certificate */ - CURLE_SSL_CIPHER, /* 59 - couldn't use specified cipher */ - CURLE_SSL_CACERT, /* 60 - problem with the CA cert (path?) 
*/ - CURLE_BAD_CONTENT_ENCODING, /* 61 - Unrecognized transfer encoding */ - CURLE_LDAP_INVALID_URL, /* 62 - Invalid LDAP URL */ - CURLE_FILESIZE_EXCEEDED, /* 63 - Maximum file size exceeded */ - CURLE_USE_SSL_FAILED, /* 64 - Requested FTP SSL level failed */ - CURLE_SEND_FAIL_REWIND, /* 65 - Sending the data requires a rewind - that failed */ - CURLE_SSL_ENGINE_INITFAILED, /* 66 - failed to initialise ENGINE */ - CURLE_LOGIN_DENIED, /* 67 - user, password or similar was not - accepted and we failed to login */ - CURLE_TFTP_NOTFOUND, /* 68 - file not found on server */ - CURLE_TFTP_PERM, /* 69 - permission problem on server */ - CURLE_REMOTE_DISK_FULL, /* 70 - out of disk space on server */ - CURLE_TFTP_ILLEGAL, /* 71 - Illegal TFTP operation */ - CURLE_TFTP_UNKNOWNID, /* 72 - Unknown transfer ID */ - CURLE_REMOTE_FILE_EXISTS, /* 73 - File already exists */ - CURLE_TFTP_NOSUCHUSER, /* 74 - No such user */ - CURLE_CONV_FAILED, /* 75 - conversion failed */ - CURLE_CONV_REQD, /* 76 - caller must register conversion - callbacks using curl_easy_setopt options - CURLOPT_CONV_FROM_NETWORK_FUNCTION, - CURLOPT_CONV_TO_NETWORK_FUNCTION, and - CURLOPT_CONV_FROM_UTF8_FUNCTION */ - CURLE_SSL_CACERT_BADFILE, /* 77 - could not load CACERT file, missing - or wrong format */ - CURLE_REMOTE_FILE_NOT_FOUND, /* 78 - remote file not found */ - CURLE_SSH, /* 79 - error from the SSH layer, somewhat - generic so the error message will be of - interest when this has happened */ - - CURLE_SSL_SHUTDOWN_FAILED, /* 80 - Failed to shut down the SSL - connection */ - CURLE_AGAIN, /* 81 - socket is not ready for send/recv, - wait till it's ready and try again (Added - in 7.18.2) */ - CURLE_SSL_CRL_BADFILE, /* 82 - could not load CRL file, missing or - wrong format (Added in 7.19.0) */ - CURLE_SSL_ISSUER_ERROR, /* 83 - Issuer check failed. (Added in - 7.19.0) */ - CURLE_FTP_PRET_FAILED, /* 84 - a PRET command failed */ - CURLE_RTSP_CSEQ_ERROR, /* 85 - mismatch of RTSP CSeq numbers */ - CURLE_RTSP_SESSION_ERROR, /* 86 - mismatch of RTSP Session Identifiers */ - CURLE_FTP_BAD_FILE_LIST, /* 87 - unable to parse FTP file list */ - CURLE_CHUNK_FAILED, /* 88 - chunk callback reported error */ - - CURL_LAST /* never use! */ -} CURLcode; - -#ifndef CURL_NO_OLDIES /* define this to test if your app builds with all - the obsolete stuff removed! */ - -/* Backwards compatibility with older names */ - -/* The following were added in 7.17.1 */ -/* These are scheduled to disappear by 2009 */ -#define CURLE_SSL_PEER_CERTIFICATE CURLE_PEER_FAILED_VERIFICATION - -/* The following were added in 7.17.0 */ -/* These are scheduled to disappear by 2009 */ -#define CURLE_OBSOLETE CURLE_OBSOLETE50 /* noone should be using this! 
*/ -#define CURLE_BAD_PASSWORD_ENTERED CURLE_OBSOLETE46 -#define CURLE_BAD_CALLING_ORDER CURLE_OBSOLETE44 -#define CURLE_FTP_USER_PASSWORD_INCORRECT CURLE_OBSOLETE10 -#define CURLE_FTP_CANT_RECONNECT CURLE_OBSOLETE16 -#define CURLE_FTP_COULDNT_GET_SIZE CURLE_OBSOLETE32 -#define CURLE_FTP_COULDNT_SET_ASCII CURLE_OBSOLETE29 -#define CURLE_FTP_WEIRD_USER_REPLY CURLE_OBSOLETE12 -#define CURLE_FTP_WRITE_ERROR CURLE_OBSOLETE20 -#define CURLE_LIBRARY_NOT_FOUND CURLE_OBSOLETE40 -#define CURLE_MALFORMAT_USER CURLE_OBSOLETE24 -#define CURLE_SHARE_IN_USE CURLE_OBSOLETE57 -#define CURLE_URL_MALFORMAT_USER CURLE_OBSOLETE4 - -#define CURLE_FTP_ACCESS_DENIED CURLE_REMOTE_ACCESS_DENIED -#define CURLE_FTP_COULDNT_SET_BINARY CURLE_FTP_COULDNT_SET_TYPE -#define CURLE_FTP_QUOTE_ERROR CURLE_QUOTE_ERROR -#define CURLE_TFTP_DISKFULL CURLE_REMOTE_DISK_FULL -#define CURLE_TFTP_EXISTS CURLE_REMOTE_FILE_EXISTS -#define CURLE_HTTP_RANGE_ERROR CURLE_RANGE_ERROR -#define CURLE_FTP_SSL_FAILED CURLE_USE_SSL_FAILED - -/* The following were added earlier */ - -#define CURLE_OPERATION_TIMEOUTED CURLE_OPERATION_TIMEDOUT - -#define CURLE_HTTP_NOT_FOUND CURLE_HTTP_RETURNED_ERROR -#define CURLE_HTTP_PORT_FAILED CURLE_INTERFACE_FAILED -#define CURLE_FTP_COULDNT_STOR_FILE CURLE_UPLOAD_FAILED - -#define CURLE_FTP_PARTIAL_FILE CURLE_PARTIAL_FILE -#define CURLE_FTP_BAD_DOWNLOAD_RESUME CURLE_BAD_DOWNLOAD_RESUME - -/* This was the error code 50 in 7.7.3 and a few earlier versions, this - is no longer used by libcurl but is instead #defined here only to not - make programs break */ -#define CURLE_ALREADY_COMPLETE 99999 - -#endif /*!CURL_NO_OLDIES*/ - -/* This prototype applies to all conversion callbacks */ -typedef CURLcode (*curl_conv_callback)(char *buffer, size_t length); - -typedef CURLcode (*curl_ssl_ctx_callback)(CURL *curl, /* easy handle */ - void *ssl_ctx, /* actually an - OpenSSL SSL_CTX */ - void *userptr); - -typedef enum { - CURLPROXY_HTTP = 0, /* added in 7.10, new in 7.19.4 default is to use - CONNECT HTTP/1.1 */ - CURLPROXY_HTTP_1_0 = 1, /* added in 7.19.4, force to use CONNECT - HTTP/1.0 */ - CURLPROXY_SOCKS4 = 4, /* support added in 7.15.2, enum existed already - in 7.10 */ - CURLPROXY_SOCKS5 = 5, /* added in 7.10 */ - CURLPROXY_SOCKS4A = 6, /* added in 7.18.0 */ - CURLPROXY_SOCKS5_HOSTNAME = 7 /* Use the SOCKS5 protocol but pass along the - host name rather than the IP address. 
added - in 7.18.0 */ -} curl_proxytype; /* this enum was added in 7.10 */ - -#define CURLAUTH_NONE 0 /* nothing */ -#define CURLAUTH_BASIC (1<<0) /* Basic (default) */ -#define CURLAUTH_DIGEST (1<<1) /* Digest */ -#define CURLAUTH_GSSNEGOTIATE (1<<2) /* GSS-Negotiate */ -#define CURLAUTH_NTLM (1<<3) /* NTLM */ -#define CURLAUTH_DIGEST_IE (1<<4) /* Digest with IE flavour */ -#define CURLAUTH_ANY (~CURLAUTH_DIGEST_IE) /* all fine types set */ -#define CURLAUTH_ANYSAFE (~(CURLAUTH_BASIC|CURLAUTH_DIGEST_IE)) - -#define CURLSSH_AUTH_ANY ~0 /* all types supported by the server */ -#define CURLSSH_AUTH_NONE 0 /* none allowed, silly but complete */ -#define CURLSSH_AUTH_PUBLICKEY (1<<0) /* public/private key files */ -#define CURLSSH_AUTH_PASSWORD (1<<1) /* password */ -#define CURLSSH_AUTH_HOST (1<<2) /* host key files */ -#define CURLSSH_AUTH_KEYBOARD (1<<3) /* keyboard interactive */ -#define CURLSSH_AUTH_DEFAULT CURLSSH_AUTH_ANY - -#define CURL_ERROR_SIZE 256 - -struct curl_khkey { - const char *key; /* points to a zero-terminated string encoded with base64 - if len is zero, otherwise to the "raw" data */ - size_t len; - enum type { - CURLKHTYPE_UNKNOWN, - CURLKHTYPE_RSA1, - CURLKHTYPE_RSA, - CURLKHTYPE_DSS - } keytype; -}; - -/* this is the set of return values expected from the curl_sshkeycallback - callback */ -enum curl_khstat { - CURLKHSTAT_FINE_ADD_TO_FILE, - CURLKHSTAT_FINE, - CURLKHSTAT_REJECT, /* reject the connection, return an error */ - CURLKHSTAT_DEFER, /* do not accept it, but we can't answer right now so - this causes a CURLE_DEFER error but otherwise the - connection will be left intact etc */ - CURLKHSTAT_LAST /* not for use, only a marker for last-in-list */ -}; - -/* this is the set of status codes pass in to the callback */ -enum curl_khmatch { - CURLKHMATCH_OK, /* match */ - CURLKHMATCH_MISMATCH, /* host found, key mismatch! */ - CURLKHMATCH_MISSING, /* no matching host/key found */ - CURLKHMATCH_LAST /* not for use, only a marker for last-in-list */ -}; - -typedef int - (*curl_sshkeycallback) (CURL *easy, /* easy handle */ - const struct curl_khkey *knownkey, /* known */ - const struct curl_khkey *foundkey, /* found */ - enum curl_khmatch, /* libcurl's view on the keys */ - void *clientp); /* custom pointer passed from app */ - -/* parameter for the CURLOPT_USE_SSL option */ -typedef enum { - CURLUSESSL_NONE, /* do not attempt to use SSL */ - CURLUSESSL_TRY, /* try using SSL, proceed anyway otherwise */ - CURLUSESSL_CONTROL, /* SSL for the control connection or fail */ - CURLUSESSL_ALL, /* SSL for all communication or fail */ - CURLUSESSL_LAST /* not an option, never use */ -} curl_usessl; - -#ifndef CURL_NO_OLDIES /* define this to test if your app builds with all - the obsolete stuff removed! 
*/ - -/* Backwards compatibility with older names */ -/* These are scheduled to disappear by 2009 */ - -#define CURLFTPSSL_NONE CURLUSESSL_NONE -#define CURLFTPSSL_TRY CURLUSESSL_TRY -#define CURLFTPSSL_CONTROL CURLUSESSL_CONTROL -#define CURLFTPSSL_ALL CURLUSESSL_ALL -#define CURLFTPSSL_LAST CURLUSESSL_LAST -#define curl_ftpssl curl_usessl -#endif /*!CURL_NO_OLDIES*/ - -/* parameter for the CURLOPT_FTP_SSL_CCC option */ -typedef enum { - CURLFTPSSL_CCC_NONE, /* do not send CCC */ - CURLFTPSSL_CCC_PASSIVE, /* Let the server initiate the shutdown */ - CURLFTPSSL_CCC_ACTIVE, /* Initiate the shutdown */ - CURLFTPSSL_CCC_LAST /* not an option, never use */ -} curl_ftpccc; - -/* parameter for the CURLOPT_FTPSSLAUTH option */ -typedef enum { - CURLFTPAUTH_DEFAULT, /* let libcurl decide */ - CURLFTPAUTH_SSL, /* use "AUTH SSL" */ - CURLFTPAUTH_TLS, /* use "AUTH TLS" */ - CURLFTPAUTH_LAST /* not an option, never use */ -} curl_ftpauth; - -/* parameter for the CURLOPT_FTP_CREATE_MISSING_DIRS option */ -typedef enum { - CURLFTP_CREATE_DIR_NONE, /* do NOT create missing dirs! */ - CURLFTP_CREATE_DIR, /* (FTP/SFTP) if CWD fails, try MKD and then CWD - again if MKD succeeded, for SFTP this does - similar magic */ - CURLFTP_CREATE_DIR_RETRY, /* (FTP only) if CWD fails, try MKD and then CWD - again even if MKD failed! */ - CURLFTP_CREATE_DIR_LAST /* not an option, never use */ -} curl_ftpcreatedir; - -/* parameter for the CURLOPT_FTP_FILEMETHOD option */ -typedef enum { - CURLFTPMETHOD_DEFAULT, /* let libcurl pick */ - CURLFTPMETHOD_MULTICWD, /* single CWD operation for each path part */ - CURLFTPMETHOD_NOCWD, /* no CWD at all */ - CURLFTPMETHOD_SINGLECWD, /* one CWD to full dir, then work on file */ - CURLFTPMETHOD_LAST /* not an option, never use */ -} curl_ftpmethod; - -/* CURLPROTO_ defines are for the CURLOPT_*PROTOCOLS options */ -#define CURLPROTO_HTTP (1<<0) -#define CURLPROTO_HTTPS (1<<1) -#define CURLPROTO_FTP (1<<2) -#define CURLPROTO_FTPS (1<<3) -#define CURLPROTO_SCP (1<<4) -#define CURLPROTO_SFTP (1<<5) -#define CURLPROTO_TELNET (1<<6) -#define CURLPROTO_LDAP (1<<7) -#define CURLPROTO_LDAPS (1<<8) -#define CURLPROTO_DICT (1<<9) -#define CURLPROTO_FILE (1<<10) -#define CURLPROTO_TFTP (1<<11) -#define CURLPROTO_IMAP (1<<12) -#define CURLPROTO_IMAPS (1<<13) -#define CURLPROTO_POP3 (1<<14) -#define CURLPROTO_POP3S (1<<15) -#define CURLPROTO_SMTP (1<<16) -#define CURLPROTO_SMTPS (1<<17) -#define CURLPROTO_RTSP (1<<18) -#define CURLPROTO_RTMP (1<<19) -#define CURLPROTO_RTMPT (1<<20) -#define CURLPROTO_RTMPE (1<<21) -#define CURLPROTO_RTMPTE (1<<22) -#define CURLPROTO_RTMPS (1<<23) -#define CURLPROTO_RTMPTS (1<<24) -#define CURLPROTO_GOPHER (1<<25) -#define CURLPROTO_ALL (~0) /* enable everything */ - -/* long may be 32 or 64 bits, but we should never depend on anything else - but 32 */ -#define CURLOPTTYPE_LONG 0 -#define CURLOPTTYPE_OBJECTPOINT 10000 -#define CURLOPTTYPE_FUNCTIONPOINT 20000 -#define CURLOPTTYPE_OFF_T 30000 - -/* name is uppercase CURLOPT_, - type is one of the defined CURLOPTTYPE_ - number is unique identifier */ -#ifdef CINIT -#undef CINIT -#endif - -#ifdef CURL_ISOCPP -#define CINIT(name,type,number) CURLOPT_ ## name = CURLOPTTYPE_ ## type + number -#else -/* The macro "##" is ISO C, we assume pre-ISO C doesn't support it. 
*/ -#define LONG CURLOPTTYPE_LONG -#define OBJECTPOINT CURLOPTTYPE_OBJECTPOINT -#define FUNCTIONPOINT CURLOPTTYPE_FUNCTIONPOINT -#define OFF_T CURLOPTTYPE_OFF_T -#define CINIT(name,type,number) CURLOPT_/**/name = type + number -#endif - -/* - * This macro-mania below setups the CURLOPT_[what] enum, to be used with - * curl_easy_setopt(). The first argument in the CINIT() macro is the [what] - * word. - */ - -typedef enum { - /* This is the FILE * or void * the regular output should be written to. */ - CINIT(FILE, OBJECTPOINT, 1), - - /* The full URL to get/put */ - CINIT(URL, OBJECTPOINT, 2), - - /* Port number to connect to, if other than default. */ - CINIT(PORT, LONG, 3), - - /* Name of proxy to use. */ - CINIT(PROXY, OBJECTPOINT, 4), - - /* "name:password" to use when fetching. */ - CINIT(USERPWD, OBJECTPOINT, 5), - - /* "name:password" to use with proxy. */ - CINIT(PROXYUSERPWD, OBJECTPOINT, 6), - - /* Range to get, specified as an ASCII string. */ - CINIT(RANGE, OBJECTPOINT, 7), - - /* not used */ - - /* Specified file stream to upload from (use as input): */ - CINIT(INFILE, OBJECTPOINT, 9), - - /* Buffer to receive error messages in, must be at least CURL_ERROR_SIZE - * bytes big. If this is not used, error messages go to stderr instead: */ - CINIT(ERRORBUFFER, OBJECTPOINT, 10), - - /* Function that will be called to store the output (instead of fwrite). The - * parameters will use fwrite() syntax, make sure to follow them. */ - CINIT(WRITEFUNCTION, FUNCTIONPOINT, 11), - - /* Function that will be called to read the input (instead of fread). The - * parameters will use fread() syntax, make sure to follow them. */ - CINIT(READFUNCTION, FUNCTIONPOINT, 12), - - /* Time-out the read operation after this amount of seconds */ - CINIT(TIMEOUT, LONG, 13), - - /* If the CURLOPT_INFILE is used, this can be used to inform libcurl about - * how large the file being sent really is. That allows better error - * checking and better verifies that the upload was successful. -1 means - * unknown size. - * - * For large file support, there is also a _LARGE version of the key - * which takes an off_t type, allowing platforms with larger off_t - * sizes to handle larger files. See below for INFILESIZE_LARGE. - */ - CINIT(INFILESIZE, LONG, 14), - - /* POST static input fields. */ - CINIT(POSTFIELDS, OBJECTPOINT, 15), - - /* Set the referrer page (needed by some CGIs) */ - CINIT(REFERER, OBJECTPOINT, 16), - - /* Set the FTP PORT string (interface name, named or numerical IP address) - Use i.e '-' to use default address. */ - CINIT(FTPPORT, OBJECTPOINT, 17), - - /* Set the User-Agent string (examined by some CGIs) */ - CINIT(USERAGENT, OBJECTPOINT, 18), - - /* If the download receives less than "low speed limit" bytes/second - * during "low speed time" seconds, the operations is aborted. - * You could i.e if you have a pretty high speed connection, abort if - * it is less than 2000 bytes/sec during 20 seconds. - */ - - /* Set the "low speed limit" */ - CINIT(LOW_SPEED_LIMIT, LONG, 19), - - /* Set the "low speed time" */ - CINIT(LOW_SPEED_TIME, LONG, 20), - - /* Set the continuation offset. - * - * Note there is also a _LARGE version of this key which uses - * off_t types, allowing for large file offsets on platforms which - * use larger-than-32-bit off_t's. Look below for RESUME_FROM_LARGE. 
- */ - CINIT(RESUME_FROM, LONG, 21), - - /* Set cookie in request: */ - CINIT(COOKIE, OBJECTPOINT, 22), - - /* This points to a linked list of headers, struct curl_slist kind */ - CINIT(HTTPHEADER, OBJECTPOINT, 23), - - /* This points to a linked list of post entries, struct curl_httppost */ - CINIT(HTTPPOST, OBJECTPOINT, 24), - - /* name of the file keeping your private SSL-certificate */ - CINIT(SSLCERT, OBJECTPOINT, 25), - - /* password for the SSL or SSH private key */ - CINIT(KEYPASSWD, OBJECTPOINT, 26), - - /* send TYPE parameter? */ - CINIT(CRLF, LONG, 27), - - /* send linked-list of QUOTE commands */ - CINIT(QUOTE, OBJECTPOINT, 28), - - /* send FILE * or void * to store headers to, if you use a callback it - is simply passed to the callback unmodified */ - CINIT(WRITEHEADER, OBJECTPOINT, 29), - - /* point to a file to read the initial cookies from, also enables - "cookie awareness" */ - CINIT(COOKIEFILE, OBJECTPOINT, 31), - - /* What version to specifically try to use. - See CURL_SSLVERSION defines below. */ - CINIT(SSLVERSION, LONG, 32), - - /* What kind of HTTP time condition to use, see defines */ - CINIT(TIMECONDITION, LONG, 33), - - /* Time to use with the above condition. Specified in number of seconds - since 1 Jan 1970 */ - CINIT(TIMEVALUE, LONG, 34), - - /* 35 = OBSOLETE */ - - /* Custom request, for customizing the get command like - HTTP: DELETE, TRACE and others - FTP: to use a different list command - */ - CINIT(CUSTOMREQUEST, OBJECTPOINT, 36), - - /* HTTP request, for odd commands like DELETE, TRACE and others */ - CINIT(STDERR, OBJECTPOINT, 37), - - /* 38 is not used */ - - /* send linked-list of post-transfer QUOTE commands */ - CINIT(POSTQUOTE, OBJECTPOINT, 39), - - /* Pass a pointer to string of the output using full variable-replacement - as described elsewhere. */ - CINIT(WRITEINFO, OBJECTPOINT, 40), - - CINIT(VERBOSE, LONG, 41), /* talk a lot */ - CINIT(HEADER, LONG, 42), /* throw the header out too */ - CINIT(NOPROGRESS, LONG, 43), /* shut off the progress meter */ - CINIT(NOBODY, LONG, 44), /* use HEAD to get http document */ - CINIT(FAILONERROR, LONG, 45), /* no output on http error codes >= 300 */ - CINIT(UPLOAD, LONG, 46), /* this is an upload */ - CINIT(POST, LONG, 47), /* HTTP POST method */ - CINIT(DIRLISTONLY, LONG, 48), /* return bare names when listing directories */ - - CINIT(APPEND, LONG, 50), /* Append instead of overwrite on upload! */ - - /* Specify whether to read the user+password from the .netrc or the URL. - * This must be one of the CURL_NETRC_* enums below. */ - CINIT(NETRC, LONG, 51), - - CINIT(FOLLOWLOCATION, LONG, 52), /* use Location: Luke! */ - - CINIT(TRANSFERTEXT, LONG, 53), /* transfer data in text/ASCII format */ - CINIT(PUT, LONG, 54), /* HTTP PUT */ - - /* 55 = OBSOLETE */ - - /* Function that will be called instead of the internal progress display - * function. This function should be defined as the curl_progress_callback - * prototype defines. 
*/ - CINIT(PROGRESSFUNCTION, FUNCTIONPOINT, 56), - - /* Data passed to the progress callback */ - CINIT(PROGRESSDATA, OBJECTPOINT, 57), - - /* We want the referrer field set automatically when following locations */ - CINIT(AUTOREFERER, LONG, 58), - - /* Port of the proxy, can be set in the proxy string as well with: - "[host]:[port]" */ - CINIT(PROXYPORT, LONG, 59), - - /* size of the POST input data, if strlen() is not good to use */ - CINIT(POSTFIELDSIZE, LONG, 60), - - /* tunnel non-http operations through a HTTP proxy */ - CINIT(HTTPPROXYTUNNEL, LONG, 61), - - /* Set the interface string to use as outgoing network interface */ - CINIT(INTERFACE, OBJECTPOINT, 62), - - /* Set the krb4/5 security level, this also enables krb4/5 awareness. This - * is a string, 'clear', 'safe', 'confidential' or 'private'. If the string - * is set but doesn't match one of these, 'private' will be used. */ - CINIT(KRBLEVEL, OBJECTPOINT, 63), - - /* Set if we should verify the peer in ssl handshake, set 1 to verify. */ - CINIT(SSL_VERIFYPEER, LONG, 64), - - /* The CApath or CAfile used to validate the peer certificate - this option is used only if SSL_VERIFYPEER is true */ - CINIT(CAINFO, OBJECTPOINT, 65), - - /* 66 = OBSOLETE */ - /* 67 = OBSOLETE */ - - /* Maximum number of http redirects to follow */ - CINIT(MAXREDIRS, LONG, 68), - - /* Pass a long set to 1 to get the date of the requested document (if - possible)! Pass a zero to shut it off. */ - CINIT(FILETIME, LONG, 69), - - /* This points to a linked list of telnet options */ - CINIT(TELNETOPTIONS, OBJECTPOINT, 70), - - /* Max amount of cached alive connections */ - CINIT(MAXCONNECTS, LONG, 71), - - /* What policy to use when closing connections when the cache is filled - up */ - CINIT(CLOSEPOLICY, LONG, 72), - - /* 73 = OBSOLETE */ - - /* Set to explicitly use a new connection for the upcoming transfer. - Do not use this unless you're absolutely sure of this, as it makes the - operation slower and is less friendly for the network. */ - CINIT(FRESH_CONNECT, LONG, 74), - - /* Set to explicitly forbid the upcoming transfer's connection to be re-used - when done. Do not use this unless you're absolutely sure of this, as it - makes the operation slower and is less friendly for the network. */ - CINIT(FORBID_REUSE, LONG, 75), - - /* Set to a file name that contains random data for libcurl to use to - seed the random engine when doing SSL connects. */ - CINIT(RANDOM_FILE, OBJECTPOINT, 76), - - /* Set to the Entropy Gathering Daemon socket pathname */ - CINIT(EGDSOCKET, OBJECTPOINT, 77), - - /* Time-out connect operations after this amount of seconds, if connects - are OK within this time, then fine... This only aborts the connect - phase. [Only works on unix-style/SIGALRM operating systems] */ - CINIT(CONNECTTIMEOUT, LONG, 78), - - /* Function that will be called to store headers (instead of fwrite). The - * parameters will use fwrite() syntax, make sure to follow them. */ - CINIT(HEADERFUNCTION, FUNCTIONPOINT, 79), - - /* Set this to force the HTTP request to get back to GET. Only really usable - if POST, PUT or a custom request have been used first. - */ - CINIT(HTTPGET, LONG, 80), - - /* Set if we should verify the Common name from the peer certificate in ssl - * handshake, set 1 to check existence, 2 to ensure that it matches the - * provided hostname. */ - CINIT(SSL_VERIFYHOST, LONG, 81), - - /* Specify which file name to write all known cookies in after completed - operation. Set file name to "-" (dash) to make it go to stdout. 
*/ - CINIT(COOKIEJAR, OBJECTPOINT, 82), - - /* Specify which SSL ciphers to use */ - CINIT(SSL_CIPHER_LIST, OBJECTPOINT, 83), - - /* Specify which HTTP version to use! This must be set to one of the - CURL_HTTP_VERSION* enums set below. */ - CINIT(HTTP_VERSION, LONG, 84), - - /* Specifically switch on or off the FTP engine's use of the EPSV command. By - default, that one will always be attempted before the more traditional - PASV command. */ - CINIT(FTP_USE_EPSV, LONG, 85), - - /* type of the file keeping your SSL-certificate ("DER", "PEM", "ENG") */ - CINIT(SSLCERTTYPE, OBJECTPOINT, 86), - - /* name of the file keeping your private SSL-key */ - CINIT(SSLKEY, OBJECTPOINT, 87), - - /* type of the file keeping your private SSL-key ("DER", "PEM", "ENG") */ - CINIT(SSLKEYTYPE, OBJECTPOINT, 88), - - /* crypto engine for the SSL-sub system */ - CINIT(SSLENGINE, OBJECTPOINT, 89), - - /* set the crypto engine for the SSL-sub system as default - the param has no meaning... - */ - CINIT(SSLENGINE_DEFAULT, LONG, 90), - - /* Non-zero value means to use the global dns cache */ - CINIT(DNS_USE_GLOBAL_CACHE, LONG, 91), /* To become OBSOLETE soon */ - - /* DNS cache timeout */ - CINIT(DNS_CACHE_TIMEOUT, LONG, 92), - - /* send linked-list of pre-transfer QUOTE commands */ - CINIT(PREQUOTE, OBJECTPOINT, 93), - - /* set the debug function */ - CINIT(DEBUGFUNCTION, FUNCTIONPOINT, 94), - - /* set the data for the debug function */ - CINIT(DEBUGDATA, OBJECTPOINT, 95), - - /* mark this as start of a cookie session */ - CINIT(COOKIESESSION, LONG, 96), - - /* The CApath directory used to validate the peer certificate - this option is used only if SSL_VERIFYPEER is true */ - CINIT(CAPATH, OBJECTPOINT, 97), - - /* Instruct libcurl to use a smaller receive buffer */ - CINIT(BUFFERSIZE, LONG, 98), - - /* Instruct libcurl to not use any signal/alarm handlers, even when using - timeouts. This option is useful for multi-threaded applications. - See libcurl-the-guide for more background information. */ - CINIT(NOSIGNAL, LONG, 99), - - /* Provide a CURLShare for mutexing non-ts data */ - CINIT(SHARE, OBJECTPOINT, 100), - - /* indicates type of proxy. accepted values are CURLPROXY_HTTP (default), - CURLPROXY_SOCKS4, CURLPROXY_SOCKS4A and CURLPROXY_SOCKS5. */ - CINIT(PROXYTYPE, LONG, 101), - - /* Set the Accept-Encoding string. Use this to tell a server you would like - the response to be compressed. */ - CINIT(ENCODING, OBJECTPOINT, 102), - - /* Set pointer to private data */ - CINIT(PRIVATE, OBJECTPOINT, 103), - - /* Set aliases for HTTP 200 in the HTTP Response header */ - CINIT(HTTP200ALIASES, OBJECTPOINT, 104), - - /* Continue to send authentication (user+password) when following locations, - even when hostname changed. This can potentially send off the name - and password to whatever host the server decides. */ - CINIT(UNRESTRICTED_AUTH, LONG, 105), - - /* Specifically switch on or off the FTP engine's use of the EPRT command ( it - also disables the LPRT attempt). By default, those ones will always be - attempted before the good old traditional PORT command. */ - CINIT(FTP_USE_EPRT, LONG, 106), - - /* Set this to a bitmask value to enable the particular authentications - methods you like. Use this in combination with CURLOPT_USERPWD. - Note that setting multiple bits may cause extra network round-trips. */ - CINIT(HTTPAUTH, LONG, 107), - - /* Set the ssl context callback function, currently only for OpenSSL ssl_ctx - in second argument. The function must be matching the - curl_ssl_ctx_callback proto. 
*/ - CINIT(SSL_CTX_FUNCTION, FUNCTIONPOINT, 108), - - /* Set the userdata for the ssl context callback function's third - argument */ - CINIT(SSL_CTX_DATA, OBJECTPOINT, 109), - - /* FTP Option that causes missing dirs to be created on the remote server. - In 7.19.4 we introduced the convenience enums for this option using the - CURLFTP_CREATE_DIR prefix. - */ - CINIT(FTP_CREATE_MISSING_DIRS, LONG, 110), - - /* Set this to a bitmask value to enable the particular authentications - methods you like. Use this in combination with CURLOPT_PROXYUSERPWD. - Note that setting multiple bits may cause extra network round-trips. */ - CINIT(PROXYAUTH, LONG, 111), - - /* FTP option that changes the timeout, in seconds, associated with - getting a response. This is different from transfer timeout time and - essentially places a demand on the FTP server to acknowledge commands - in a timely manner. */ - CINIT(FTP_RESPONSE_TIMEOUT, LONG, 112), -#define CURLOPT_SERVER_RESPONSE_TIMEOUT CURLOPT_FTP_RESPONSE_TIMEOUT - - /* Set this option to one of the CURL_IPRESOLVE_* defines (see below) to - tell libcurl to resolve names to those IP versions only. This only has - affect on systems with support for more than one, i.e IPv4 _and_ IPv6. */ - CINIT(IPRESOLVE, LONG, 113), - - /* Set this option to limit the size of a file that will be downloaded from - an HTTP or FTP server. - - Note there is also _LARGE version which adds large file support for - platforms which have larger off_t sizes. See MAXFILESIZE_LARGE below. */ - CINIT(MAXFILESIZE, LONG, 114), - - /* See the comment for INFILESIZE above, but in short, specifies - * the size of the file being uploaded. -1 means unknown. - */ - CINIT(INFILESIZE_LARGE, OFF_T, 115), - - /* Sets the continuation offset. There is also a LONG version of this; - * look above for RESUME_FROM. - */ - CINIT(RESUME_FROM_LARGE, OFF_T, 116), - - /* Sets the maximum size of data that will be downloaded from - * an HTTP or FTP server. See MAXFILESIZE above for the LONG version. - */ - CINIT(MAXFILESIZE_LARGE, OFF_T, 117), - - /* Set this option to the file name of your .netrc file you want libcurl - to parse (using the CURLOPT_NETRC option). If not set, libcurl will do - a poor attempt to find the user's home directory and check for a .netrc - file in there. */ - CINIT(NETRC_FILE, OBJECTPOINT, 118), - - /* Enable SSL/TLS for FTP, pick one of: - CURLFTPSSL_TRY - try using SSL, proceed anyway otherwise - CURLFTPSSL_CONTROL - SSL for the control connection or fail - CURLFTPSSL_ALL - SSL for all communication or fail - */ - CINIT(USE_SSL, LONG, 119), - - /* The _LARGE version of the standard POSTFIELDSIZE option */ - CINIT(POSTFIELDSIZE_LARGE, OFF_T, 120), - - /* Enable/disable the TCP Nagle algorithm */ - CINIT(TCP_NODELAY, LONG, 121), - - /* 122 OBSOLETE, used in 7.12.3. Gone in 7.13.0 */ - /* 123 OBSOLETE. Gone in 7.16.0 */ - /* 124 OBSOLETE, used in 7.12.3. Gone in 7.13.0 */ - /* 125 OBSOLETE, used in 7.12.3. Gone in 7.13.0 */ - /* 126 OBSOLETE, used in 7.12.3. Gone in 7.13.0 */ - /* 127 OBSOLETE. Gone in 7.16.0 */ - /* 128 OBSOLETE. Gone in 7.16.0 */ - - /* When FTP over SSL/TLS is selected (with CURLOPT_USE_SSL), this option - can be used to change libcurl's default action which is to first try - "AUTH SSL" and then "AUTH TLS" in this order, and proceed when a OK - response has been received. 
- - Available parameters are: - CURLFTPAUTH_DEFAULT - let libcurl decide - CURLFTPAUTH_SSL - try "AUTH SSL" first, then TLS - CURLFTPAUTH_TLS - try "AUTH TLS" first, then SSL - */ - CINIT(FTPSSLAUTH, LONG, 129), - - CINIT(IOCTLFUNCTION, FUNCTIONPOINT, 130), - CINIT(IOCTLDATA, OBJECTPOINT, 131), - - /* 132 OBSOLETE. Gone in 7.16.0 */ - /* 133 OBSOLETE. Gone in 7.16.0 */ - - /* zero terminated string for pass on to the FTP server when asked for - "account" info */ - CINIT(FTP_ACCOUNT, OBJECTPOINT, 134), - - /* feed cookies into cookie engine */ - CINIT(COOKIELIST, OBJECTPOINT, 135), - - /* ignore Content-Length */ - CINIT(IGNORE_CONTENT_LENGTH, LONG, 136), - - /* Set to non-zero to skip the IP address received in a 227 PASV FTP server - response. Typically used for FTP-SSL purposes but is not restricted to - that. libcurl will then instead use the same IP address it used for the - control connection. */ - CINIT(FTP_SKIP_PASV_IP, LONG, 137), - - /* Select "file method" to use when doing FTP, see the curl_ftpmethod - above. */ - CINIT(FTP_FILEMETHOD, LONG, 138), - - /* Local port number to bind the socket to */ - CINIT(LOCALPORT, LONG, 139), - - /* Number of ports to try, including the first one set with LOCALPORT. - Thus, setting it to 1 will make no additional attempts but the first. - */ - CINIT(LOCALPORTRANGE, LONG, 140), - - /* no transfer, set up connection and let application use the socket by - extracting it with CURLINFO_LASTSOCKET */ - CINIT(CONNECT_ONLY, LONG, 141), - - /* Function that will be called to convert from the - network encoding (instead of using the iconv calls in libcurl) */ - CINIT(CONV_FROM_NETWORK_FUNCTION, FUNCTIONPOINT, 142), - - /* Function that will be called to convert to the - network encoding (instead of using the iconv calls in libcurl) */ - CINIT(CONV_TO_NETWORK_FUNCTION, FUNCTIONPOINT, 143), - - /* Function that will be called to convert from UTF8 - (instead of using the iconv calls in libcurl) - Note that this is used only for SSL certificate processing */ - CINIT(CONV_FROM_UTF8_FUNCTION, FUNCTIONPOINT, 144), - - /* if the connection proceeds too quickly then need to slow it down */ - /* limit-rate: maximum number of bytes per second to send or receive */ - CINIT(MAX_SEND_SPEED_LARGE, OFF_T, 145), - CINIT(MAX_RECV_SPEED_LARGE, OFF_T, 146), - - /* Pointer to command string to send if USER/PASS fails. 
*/ - CINIT(FTP_ALTERNATIVE_TO_USER, OBJECTPOINT, 147), - - /* callback function for setting socket options */ - CINIT(SOCKOPTFUNCTION, FUNCTIONPOINT, 148), - CINIT(SOCKOPTDATA, OBJECTPOINT, 149), - - /* set to 0 to disable session ID re-use for this transfer, default is - enabled (== 1) */ - CINIT(SSL_SESSIONID_CACHE, LONG, 150), - - /* allowed SSH authentication methods */ - CINIT(SSH_AUTH_TYPES, LONG, 151), - - /* Used by scp/sftp to do public/private key authentication */ - CINIT(SSH_PUBLIC_KEYFILE, OBJECTPOINT, 152), - CINIT(SSH_PRIVATE_KEYFILE, OBJECTPOINT, 153), - - /* Send CCC (Clear Command Channel) after authentication */ - CINIT(FTP_SSL_CCC, LONG, 154), - - /* Same as TIMEOUT and CONNECTTIMEOUT, but with ms resolution */ - CINIT(TIMEOUT_MS, LONG, 155), - CINIT(CONNECTTIMEOUT_MS, LONG, 156), - - /* set to zero to disable the libcurl's decoding and thus pass the raw body - data to the application even when it is encoded/compressed */ - CINIT(HTTP_TRANSFER_DECODING, LONG, 157), - CINIT(HTTP_CONTENT_DECODING, LONG, 158), - - /* Permission used when creating new files and directories on the remote - server for protocols that support it, SFTP/SCP/FILE */ - CINIT(NEW_FILE_PERMS, LONG, 159), - CINIT(NEW_DIRECTORY_PERMS, LONG, 160), - - /* Set the behaviour of POST when redirecting. Values must be set to one - of CURL_REDIR* defines below. This used to be called CURLOPT_POST301 */ - CINIT(POSTREDIR, LONG, 161), - - /* used by scp/sftp to verify the host's public key */ - CINIT(SSH_HOST_PUBLIC_KEY_MD5, OBJECTPOINT, 162), - - /* Callback function for opening socket (instead of socket(2)). Optionally, - callback is able change the address or refuse to connect returning - CURL_SOCKET_BAD. The callback should have type - curl_opensocket_callback */ - CINIT(OPENSOCKETFUNCTION, FUNCTIONPOINT, 163), - CINIT(OPENSOCKETDATA, OBJECTPOINT, 164), - - /* POST volatile input fields. */ - CINIT(COPYPOSTFIELDS, OBJECTPOINT, 165), - - /* set transfer mode (;type=) when doing FTP via an HTTP proxy */ - CINIT(PROXY_TRANSFER_MODE, LONG, 166), - - /* Callback function for seeking in the input stream */ - CINIT(SEEKFUNCTION, FUNCTIONPOINT, 167), - CINIT(SEEKDATA, OBJECTPOINT, 168), - - /* CRL file */ - CINIT(CRLFILE, OBJECTPOINT, 169), - - /* Issuer certificate */ - CINIT(ISSUERCERT, OBJECTPOINT, 170), - - /* (IPv6) Address scope */ - CINIT(ADDRESS_SCOPE, LONG, 171), - - /* Collect certificate chain info and allow it to get retrievable with - CURLINFO_CERTINFO after the transfer is complete. (Unfortunately) only - working with OpenSSL-powered builds. */ - CINIT(CERTINFO, LONG, 172), - - /* "name" and "pwd" to use when fetching. */ - CINIT(USERNAME, OBJECTPOINT, 173), - CINIT(PASSWORD, OBJECTPOINT, 174), - - /* "name" and "pwd" to use with Proxy when fetching. */ - CINIT(PROXYUSERNAME, OBJECTPOINT, 175), - CINIT(PROXYPASSWORD, OBJECTPOINT, 176), - - /* Comma separated list of hostnames defining no-proxy zones. These should - match both hostnames directly, and hostnames within a domain. For - example, local.com will match local.com and www.local.com, but NOT - notlocal.com or www.notlocal.com. For compatibility with other - implementations of this, .local.com will be considered to be the same as - local.com. A single * is the only valid wildcard, and effectively - disables the use of proxy. 
*/ - CINIT(NOPROXY, OBJECTPOINT, 177), - - /* block size for TFTP transfers */ - CINIT(TFTP_BLKSIZE, LONG, 178), - - /* Socks Service */ - CINIT(SOCKS5_GSSAPI_SERVICE, OBJECTPOINT, 179), - - /* Socks Service */ - CINIT(SOCKS5_GSSAPI_NEC, LONG, 180), - - /* set the bitmask for the protocols that are allowed to be used for the - transfer, which thus helps the app which takes URLs from users or other - external inputs and want to restrict what protocol(s) to deal - with. Defaults to CURLPROTO_ALL. */ - CINIT(PROTOCOLS, LONG, 181), - - /* set the bitmask for the protocols that libcurl is allowed to follow to, - as a subset of the CURLOPT_PROTOCOLS ones. That means the protocol needs - to be set in both bitmasks to be allowed to get redirected to. Defaults - to all protocols except FILE and SCP. */ - CINIT(REDIR_PROTOCOLS, LONG, 182), - - /* set the SSH knownhost file name to use */ - CINIT(SSH_KNOWNHOSTS, OBJECTPOINT, 183), - - /* set the SSH host key callback, must point to a curl_sshkeycallback - function */ - CINIT(SSH_KEYFUNCTION, FUNCTIONPOINT, 184), - - /* set the SSH host key callback custom pointer */ - CINIT(SSH_KEYDATA, OBJECTPOINT, 185), - - /* set the SMTP mail originator */ - CINIT(MAIL_FROM, OBJECTPOINT, 186), - - /* set the SMTP mail receiver(s) */ - CINIT(MAIL_RCPT, OBJECTPOINT, 187), - - /* FTP: send PRET before PASV */ - CINIT(FTP_USE_PRET, LONG, 188), - - /* RTSP request method (OPTIONS, SETUP, PLAY, etc...) */ - CINIT(RTSP_REQUEST, LONG, 189), - - /* The RTSP session identifier */ - CINIT(RTSP_SESSION_ID, OBJECTPOINT, 190), - - /* The RTSP stream URI */ - CINIT(RTSP_STREAM_URI, OBJECTPOINT, 191), - - /* The Transport: header to use in RTSP requests */ - CINIT(RTSP_TRANSPORT, OBJECTPOINT, 192), - - /* Manually initialize the client RTSP CSeq for this handle */ - CINIT(RTSP_CLIENT_CSEQ, LONG, 193), - - /* Manually initialize the server RTSP CSeq for this handle */ - CINIT(RTSP_SERVER_CSEQ, LONG, 194), - - /* The stream to pass to INTERLEAVEFUNCTION. */ - CINIT(INTERLEAVEDATA, OBJECTPOINT, 195), - - /* Let the application define a custom write method for RTP data */ - CINIT(INTERLEAVEFUNCTION, FUNCTIONPOINT, 196), - - /* Turn on wildcard matching */ - CINIT(WILDCARDMATCH, LONG, 197), - - /* Directory matching callback called before downloading of an - individual file (chunk) started */ - CINIT(CHUNK_BGN_FUNCTION, FUNCTIONPOINT, 198), - - /* Directory matching callback called after the file (chunk) - was downloaded, or skipped */ - CINIT(CHUNK_END_FUNCTION, FUNCTIONPOINT, 199), - - /* Change match (fnmatch-like) callback for wildcard matching */ - CINIT(FNMATCH_FUNCTION, FUNCTIONPOINT, 200), - - /* Let the application define custom chunk data pointer */ - CINIT(CHUNK_DATA, OBJECTPOINT, 201), - - /* FNMATCH_FUNCTION user pointer */ - CINIT(FNMATCH_DATA, OBJECTPOINT, 202), - - CURLOPT_LASTENTRY /* the last unused */ -} CURLoption; - -#ifndef CURL_NO_OLDIES /* define this to test if your app builds with all - the obsolete stuff removed! 
*/ - -/* Backwards compatibility with older names */ -/* These are scheduled to disappear by 2011 */ - -/* This was added in version 7.19.1 */ -#define CURLOPT_POST301 CURLOPT_POSTREDIR - -/* These are scheduled to disappear by 2009 */ - -/* The following were added in 7.17.0 */ -#define CURLOPT_SSLKEYPASSWD CURLOPT_KEYPASSWD -#define CURLOPT_FTPAPPEND CURLOPT_APPEND -#define CURLOPT_FTPLISTONLY CURLOPT_DIRLISTONLY -#define CURLOPT_FTP_SSL CURLOPT_USE_SSL - -/* The following were added earlier */ - -#define CURLOPT_SSLCERTPASSWD CURLOPT_KEYPASSWD -#define CURLOPT_KRB4LEVEL CURLOPT_KRBLEVEL - -#else -/* This is set if CURL_NO_OLDIES is defined at compile-time */ -#undef CURLOPT_DNS_USE_GLOBAL_CACHE /* soon obsolete */ -#endif - - - /* Below here follows defines for the CURLOPT_IPRESOLVE option. If a host - name resolves addresses using more than one IP protocol version, this - option might be handy to force libcurl to use a specific IP version. */ -#define CURL_IPRESOLVE_WHATEVER 0 /* default, resolves addresses to all IP - versions that your system allows */ -#define CURL_IPRESOLVE_V4 1 /* resolve to ipv4 addresses */ -#define CURL_IPRESOLVE_V6 2 /* resolve to ipv6 addresses */ - - /* three convenient "aliases" that follow the name scheme better */ -#define CURLOPT_WRITEDATA CURLOPT_FILE -#define CURLOPT_READDATA CURLOPT_INFILE -#define CURLOPT_HEADERDATA CURLOPT_WRITEHEADER -#define CURLOPT_RTSPHEADER CURLOPT_HTTPHEADER - - /* These enums are for use with the CURLOPT_HTTP_VERSION option. */ -enum { - CURL_HTTP_VERSION_NONE, /* setting this means we don't care, and that we'd - like the library to choose the best possible - for us! */ - CURL_HTTP_VERSION_1_0, /* please use HTTP 1.0 in the request */ - CURL_HTTP_VERSION_1_1, /* please use HTTP 1.1 in the request */ - - CURL_HTTP_VERSION_LAST /* *ILLEGAL* http version */ -}; - -/* - * Public API enums for RTSP requests - */ -enum { - CURL_RTSPREQ_NONE, /* first in list */ - CURL_RTSPREQ_OPTIONS, - CURL_RTSPREQ_DESCRIBE, - CURL_RTSPREQ_ANNOUNCE, - CURL_RTSPREQ_SETUP, - CURL_RTSPREQ_PLAY, - CURL_RTSPREQ_PAUSE, - CURL_RTSPREQ_TEARDOWN, - CURL_RTSPREQ_GET_PARAMETER, - CURL_RTSPREQ_SET_PARAMETER, - CURL_RTSPREQ_RECORD, - CURL_RTSPREQ_RECEIVE, - CURL_RTSPREQ_LAST /* last in list */ -}; - - /* These enums are for use with the CURLOPT_NETRC option. */ -enum CURL_NETRC_OPTION { - CURL_NETRC_IGNORED, /* The .netrc will never be read. - * This is the default. */ - CURL_NETRC_OPTIONAL, /* A user:password in the URL will be preferred - * to one in the .netrc. */ - CURL_NETRC_REQUIRED, /* A user:password in the URL will be ignored. - * Unless one is set programmatically, the .netrc - * will be queried. */ - CURL_NETRC_LAST -}; - -enum { - CURL_SSLVERSION_DEFAULT, - CURL_SSLVERSION_TLSv1, - CURL_SSLVERSION_SSLv2, - CURL_SSLVERSION_SSLv3, - - CURL_SSLVERSION_LAST /* never use, keep last */ -}; - -/* symbols to use with CURLOPT_POSTREDIR. 
- CURL_REDIR_POST_301 and CURL_REDIR_POST_302 can be bitwise ORed so that - CURL_REDIR_POST_301 | CURL_REDIR_POST_302 == CURL_REDIR_POST_ALL */ - -#define CURL_REDIR_GET_ALL 0 -#define CURL_REDIR_POST_301 1 -#define CURL_REDIR_POST_302 2 -#define CURL_REDIR_POST_ALL (CURL_REDIR_POST_301|CURL_REDIR_POST_302) - -typedef enum { - CURL_TIMECOND_NONE, - - CURL_TIMECOND_IFMODSINCE, - CURL_TIMECOND_IFUNMODSINCE, - CURL_TIMECOND_LASTMOD, - - CURL_TIMECOND_LAST -} curl_TimeCond; - - -/* curl_strequal() and curl_strnequal() are subject for removal in a future - libcurl, see lib/README.curlx for details */ -CURL_EXTERN int (curl_strequal)(const char *s1, const char *s2); -CURL_EXTERN int (curl_strnequal)(const char *s1, const char *s2, size_t n); - -/* name is uppercase CURLFORM_ */ -#ifdef CFINIT -#undef CFINIT -#endif - -#ifdef CURL_ISOCPP -#define CFINIT(name) CURLFORM_ ## name -#else -/* The macro "##" is ISO C, we assume pre-ISO C doesn't support it. */ -#define CFINIT(name) CURLFORM_/**/name -#endif - -typedef enum { - CFINIT(NOTHING), /********* the first one is unused ************/ - - /* */ - CFINIT(COPYNAME), - CFINIT(PTRNAME), - CFINIT(NAMELENGTH), - CFINIT(COPYCONTENTS), - CFINIT(PTRCONTENTS), - CFINIT(CONTENTSLENGTH), - CFINIT(FILECONTENT), - CFINIT(ARRAY), - CFINIT(OBSOLETE), - CFINIT(FILE), - - CFINIT(BUFFER), - CFINIT(BUFFERPTR), - CFINIT(BUFFERLENGTH), - - CFINIT(CONTENTTYPE), - CFINIT(CONTENTHEADER), - CFINIT(FILENAME), - CFINIT(END), - CFINIT(OBSOLETE2), - - CFINIT(STREAM), - - CURLFORM_LASTENTRY /* the last unused */ -} CURLformoption; - -#undef CFINIT /* done */ - -/* structure to be used as parameter for CURLFORM_ARRAY */ -struct curl_forms { - CURLformoption option; - const char *value; -}; - -/* use this for multipart formpost building */ -/* Returns code for curl_formadd() - * - * Returns: - * CURL_FORMADD_OK on success - * CURL_FORMADD_MEMORY if the FormInfo allocation fails - * CURL_FORMADD_OPTION_TWICE if one option is given twice for one Form - * CURL_FORMADD_NULL if a null pointer was given for a char - * CURL_FORMADD_MEMORY if the allocation of a FormInfo struct failed - * CURL_FORMADD_UNKNOWN_OPTION if an unknown option was used - * CURL_FORMADD_INCOMPLETE if the some FormInfo is not complete (or error) - * CURL_FORMADD_MEMORY if a curl_httppost struct cannot be allocated - * CURL_FORMADD_MEMORY if some allocation for string copying failed. - * CURL_FORMADD_ILLEGAL_ARRAY if an illegal option is used in an array - * - ***************************************************************************/ -typedef enum { - CURL_FORMADD_OK, /* first, no error */ - - CURL_FORMADD_MEMORY, - CURL_FORMADD_OPTION_TWICE, - CURL_FORMADD_NULL, - CURL_FORMADD_UNKNOWN_OPTION, - CURL_FORMADD_INCOMPLETE, - CURL_FORMADD_ILLEGAL_ARRAY, - CURL_FORMADD_DISABLED, /* libcurl was built with this disabled */ - - CURL_FORMADD_LAST /* last */ -} CURLFORMcode; - -/* - * NAME curl_formadd() - * - * DESCRIPTION - * - * Pretty advanced function for building multi-part formposts. Each invoke - * adds one part that together construct a full post. Then use - * CURLOPT_HTTPPOST to send it off to libcurl. - */ -CURL_EXTERN CURLFORMcode curl_formadd(struct curl_httppost **httppost, - struct curl_httppost **last_post, - ...); - -/* - * callback function for curl_formget() - * The void *arg pointer will be the one passed as second argument to - * curl_formget(). - * The character buffer passed to it must not be freed. - * Should return the buffer length passed to it as the argument "len" on - * success. 
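curl_formadd() above is variadic and easiest to grasp from an example. A minimal sketch with error handling trimmed (each call actually returns one of the CURLFORMcode values listed above):

#include <curl/curl.h>

/* Build a two-part formpost: one literal field, one file upload. */
static struct curl_httppost *build_form(CURL *handle)
{
    struct curl_httppost *post = NULL;
    struct curl_httppost *last = NULL;

    curl_formadd(&post, &last,
                 CURLFORM_COPYNAME, "name",
                 CURLFORM_COPYCONTENTS, "value",
                 CURLFORM_END);
    curl_formadd(&post, &last,
                 CURLFORM_COPYNAME, "upload",
                 CURLFORM_FILE, "body.txt",
                 CURLFORM_END);

    curl_easy_setopt(handle, CURLOPT_HTTPPOST, post);
    return post; /* release with curl_formfree() after the transfer */
}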
- */ -typedef size_t (*curl_formget_callback)(void *arg, const char *buf, size_t len); - -/* - * NAME curl_formget() - * - * DESCRIPTION - * - * Serialize a curl_httppost struct built with curl_formadd(). - * Accepts a void pointer as second argument which will be passed to - * the curl_formget_callback function. - * Returns 0 on success. - */ -CURL_EXTERN int curl_formget(struct curl_httppost *form, void *arg, - curl_formget_callback append); -/* - * NAME curl_formfree() - * - * DESCRIPTION - * - * Free a multipart formpost previously built with curl_formadd(). - */ -CURL_EXTERN void curl_formfree(struct curl_httppost *form); - -/* - * NAME curl_getenv() - * - * DESCRIPTION - * - * Returns a malloc()'ed string that MUST be curl_free()ed after usage is - * complete. DEPRECATED - see lib/README.curlx - */ -CURL_EXTERN char *curl_getenv(const char *variable); - -/* - * NAME curl_version() - * - * DESCRIPTION - * - * Returns a static ascii string of the libcurl version. - */ -CURL_EXTERN char *curl_version(void); - -/* - * NAME curl_easy_escape() - * - * DESCRIPTION - * - * Escapes URL strings (converts all letters consider illegal in URLs to their - * %XX versions). This function returns a new allocated string or NULL if an - * error occurred. - */ -CURL_EXTERN char *curl_easy_escape(CURL *handle, - const char *string, - int length); - -/* the previous version: */ -CURL_EXTERN char *curl_escape(const char *string, - int length); - - -/* - * NAME curl_easy_unescape() - * - * DESCRIPTION - * - * Unescapes URL encoding in strings (converts all %XX codes to their 8bit - * versions). This function returns a new allocated string or NULL if an error - * occurred. - * Conversion Note: On non-ASCII platforms the ASCII %XX codes are - * converted into the host encoding. - */ -CURL_EXTERN char *curl_easy_unescape(CURL *handle, - const char *string, - int length, - int *outlength); - -/* the previous version */ -CURL_EXTERN char *curl_unescape(const char *string, - int length); - -/* - * NAME curl_free() - * - * DESCRIPTION - * - * Provided for de-allocation in the same translation unit that did the - * allocation. Added in libcurl 7.10 - */ -CURL_EXTERN void curl_free(void *p); - -/* - * NAME curl_global_init() - * - * DESCRIPTION - * - * curl_global_init() should be invoked exactly once for each application that - * uses libcurl and before any call of other libcurl functions. - * - * This function is not thread-safe! - */ -CURL_EXTERN CURLcode curl_global_init(long flags); - -/* - * NAME curl_global_init_mem() - * - * DESCRIPTION - * - * curl_global_init() or curl_global_init_mem() should be invoked exactly once - * for each application that uses libcurl. This function can be used to - * initialize libcurl and set user defined memory management callback - * functions. Users can implement memory management routines to check for - * memory leaks, check for mis-use of the curl library etc. User registered - * callback routines with be invoked by this library instead of the system - * memory management routines like malloc, free etc. 
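Since curl_easy_escape() allocates with libcurl's own allocator, pairing it with curl_free() matters; a short sketch (not part of the patch):

#include <stdio.h>
#include <curl/curl.h>

/* URL-encode a string; a length of 0 means "use strlen()". */
static void escape_demo(CURL *handle)
{
    char *enc = curl_easy_escape(handle, "a value with spaces", 0);
    if(enc) {
        printf("encoded: %s\n", enc);
        curl_free(enc); /* never plain free(): the allocators may differ */
    }
}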
- */ -CURL_EXTERN CURLcode curl_global_init_mem(long flags, - curl_malloc_callback m, - curl_free_callback f, - curl_realloc_callback r, - curl_strdup_callback s, - curl_calloc_callback c); - -/* - * NAME curl_global_cleanup() - * - * DESCRIPTION - * - * curl_global_cleanup() should be invoked exactly once for each application - * that uses libcurl - */ -CURL_EXTERN void curl_global_cleanup(void); - -/* linked-list structure for the CURLOPT_QUOTE option (and other) */ -struct curl_slist { - char *data; - struct curl_slist *next; -}; - -/* - * NAME curl_slist_append() - * - * DESCRIPTION - * - * Appends a string to a linked list. If no list exists, it will be created - * first. Returns the new list, after appending. - */ -CURL_EXTERN struct curl_slist *curl_slist_append(struct curl_slist *, - const char *); - -/* - * NAME curl_slist_free_all() - * - * DESCRIPTION - * - * free a previously built curl_slist. - */ -CURL_EXTERN void curl_slist_free_all(struct curl_slist *); - -/* - * NAME curl_getdate() - * - * DESCRIPTION - * - * Returns the time, in seconds since 1 Jan 1970 of the time string given in - * the first argument. The time argument in the second parameter is unused - * and should be set to NULL. - */ -CURL_EXTERN time_t curl_getdate(const char *p, const time_t *unused); - -/* info about the certificate chain, only for OpenSSL builds. Asked - for with CURLOPT_CERTINFO / CURLINFO_CERTINFO */ -struct curl_certinfo { - int num_of_certs; /* number of certificates with information */ - struct curl_slist **certinfo; /* for each index in this array, there's a - linked list with textual information in the - format "name: value" */ -}; - -#define CURLINFO_STRING 0x100000 -#define CURLINFO_LONG 0x200000 -#define CURLINFO_DOUBLE 0x300000 -#define CURLINFO_SLIST 0x400000 -#define CURLINFO_MASK 0x0fffff -#define CURLINFO_TYPEMASK 0xf00000 - -typedef enum { - CURLINFO_NONE, /* first, never use this */ - CURLINFO_EFFECTIVE_URL = CURLINFO_STRING + 1, - CURLINFO_RESPONSE_CODE = CURLINFO_LONG + 2, - CURLINFO_TOTAL_TIME = CURLINFO_DOUBLE + 3, - CURLINFO_NAMELOOKUP_TIME = CURLINFO_DOUBLE + 4, - CURLINFO_CONNECT_TIME = CURLINFO_DOUBLE + 5, - CURLINFO_PRETRANSFER_TIME = CURLINFO_DOUBLE + 6, - CURLINFO_SIZE_UPLOAD = CURLINFO_DOUBLE + 7, - CURLINFO_SIZE_DOWNLOAD = CURLINFO_DOUBLE + 8, - CURLINFO_SPEED_DOWNLOAD = CURLINFO_DOUBLE + 9, - CURLINFO_SPEED_UPLOAD = CURLINFO_DOUBLE + 10, - CURLINFO_HEADER_SIZE = CURLINFO_LONG + 11, - CURLINFO_REQUEST_SIZE = CURLINFO_LONG + 12, - CURLINFO_SSL_VERIFYRESULT = CURLINFO_LONG + 13, - CURLINFO_FILETIME = CURLINFO_LONG + 14, - CURLINFO_CONTENT_LENGTH_DOWNLOAD = CURLINFO_DOUBLE + 15, - CURLINFO_CONTENT_LENGTH_UPLOAD = CURLINFO_DOUBLE + 16, - CURLINFO_STARTTRANSFER_TIME = CURLINFO_DOUBLE + 17, - CURLINFO_CONTENT_TYPE = CURLINFO_STRING + 18, - CURLINFO_REDIRECT_TIME = CURLINFO_DOUBLE + 19, - CURLINFO_REDIRECT_COUNT = CURLINFO_LONG + 20, - CURLINFO_PRIVATE = CURLINFO_STRING + 21, - CURLINFO_HTTP_CONNECTCODE = CURLINFO_LONG + 22, - CURLINFO_HTTPAUTH_AVAIL = CURLINFO_LONG + 23, - CURLINFO_PROXYAUTH_AVAIL = CURLINFO_LONG + 24, - CURLINFO_OS_ERRNO = CURLINFO_LONG + 25, - CURLINFO_NUM_CONNECTS = CURLINFO_LONG + 26, - CURLINFO_SSL_ENGINES = CURLINFO_SLIST + 27, - CURLINFO_COOKIELIST = CURLINFO_SLIST + 28, - CURLINFO_LASTSOCKET = CURLINFO_LONG + 29, - CURLINFO_FTP_ENTRY_PATH = CURLINFO_STRING + 30, - CURLINFO_REDIRECT_URL = CURLINFO_STRING + 31, - CURLINFO_PRIMARY_IP = CURLINFO_STRING + 32, - CURLINFO_APPCONNECT_TIME = CURLINFO_DOUBLE + 33, - CURLINFO_CERTINFO = CURLINFO_SLIST + 34, 
- CURLINFO_CONDITION_UNMET = CURLINFO_LONG + 35, - CURLINFO_RTSP_SESSION_ID = CURLINFO_STRING + 36, - CURLINFO_RTSP_CLIENT_CSEQ = CURLINFO_LONG + 37, - CURLINFO_RTSP_SERVER_CSEQ = CURLINFO_LONG + 38, - CURLINFO_RTSP_CSEQ_RECV = CURLINFO_LONG + 39, - CURLINFO_PRIMARY_PORT = CURLINFO_LONG + 40, - CURLINFO_LOCAL_IP = CURLINFO_STRING + 41, - CURLINFO_LOCAL_PORT = CURLINFO_LONG + 42, - /* Fill in new entries below here! */ - - CURLINFO_LASTONE = 42 -} CURLINFO; - -/* CURLINFO_RESPONSE_CODE is the new name for the option previously known as - CURLINFO_HTTP_CODE */ -#define CURLINFO_HTTP_CODE CURLINFO_RESPONSE_CODE - -typedef enum { - CURLCLOSEPOLICY_NONE, /* first, never use this */ - - CURLCLOSEPOLICY_OLDEST, - CURLCLOSEPOLICY_LEAST_RECENTLY_USED, - CURLCLOSEPOLICY_LEAST_TRAFFIC, - CURLCLOSEPOLICY_SLOWEST, - CURLCLOSEPOLICY_CALLBACK, - - CURLCLOSEPOLICY_LAST /* last, never use this */ -} curl_closepolicy; - -#define CURL_GLOBAL_SSL (1<<0) -#define CURL_GLOBAL_WIN32 (1<<1) -#define CURL_GLOBAL_ALL (CURL_GLOBAL_SSL|CURL_GLOBAL_WIN32) -#define CURL_GLOBAL_NOTHING 0 -#define CURL_GLOBAL_DEFAULT CURL_GLOBAL_ALL - - -/***************************************************************************** - * Setup defines, protos etc for the sharing stuff. - */ - -/* Different data locks for a single share */ -typedef enum { - CURL_LOCK_DATA_NONE = 0, - /* CURL_LOCK_DATA_SHARE is used internally to say that - * the locking is just made to change the internal state of the share - * itself. - */ - CURL_LOCK_DATA_SHARE, - CURL_LOCK_DATA_COOKIE, - CURL_LOCK_DATA_DNS, - CURL_LOCK_DATA_SSL_SESSION, - CURL_LOCK_DATA_CONNECT, - CURL_LOCK_DATA_LAST -} curl_lock_data; - -/* Different lock access types */ -typedef enum { - CURL_LOCK_ACCESS_NONE = 0, /* unspecified action */ - CURL_LOCK_ACCESS_SHARED = 1, /* for read perhaps */ - CURL_LOCK_ACCESS_SINGLE = 2, /* for write perhaps */ - CURL_LOCK_ACCESS_LAST /* never use */ -} curl_lock_access; - -typedef void (*curl_lock_function)(CURL *handle, - curl_lock_data data, - curl_lock_access locktype, - void *userptr); -typedef void (*curl_unlock_function)(CURL *handle, - curl_lock_data data, - void *userptr); - -typedef void CURLSH; - -typedef enum { - CURLSHE_OK, /* all is fine */ - CURLSHE_BAD_OPTION, /* 1 */ - CURLSHE_IN_USE, /* 2 */ - CURLSHE_INVALID, /* 3 */ - CURLSHE_NOMEM, /* out of memory */ - CURLSHE_LAST /* never use */ -} CURLSHcode; - -typedef enum { - CURLSHOPT_NONE, /* don't use */ - CURLSHOPT_SHARE, /* specify a data type to share */ - CURLSHOPT_UNSHARE, /* specify which data type to stop sharing */ - CURLSHOPT_LOCKFUNC, /* pass in a 'curl_lock_function' pointer */ - CURLSHOPT_UNLOCKFUNC, /* pass in a 'curl_unlock_function' pointer */ - CURLSHOPT_USERDATA, /* pass in a user data pointer used in the lock/unlock - callback functions */ - CURLSHOPT_LAST /* never use */ -} CURLSHoption; - -CURL_EXTERN CURLSH *curl_share_init(void); -CURL_EXTERN CURLSHcode curl_share_setopt(CURLSH *, CURLSHoption option, ...); -CURL_EXTERN CURLSHcode curl_share_cleanup(CURLSH *); - -/**************************************************************************** - * Structures for querying information about the curl library at runtime. - */ - -typedef enum { - CURLVERSION_FIRST, - CURLVERSION_SECOND, - CURLVERSION_THIRD, - CURLVERSION_FOURTH, - CURLVERSION_LAST /* never actually use this */ -} CURLversion; - -/* The 'CURLVERSION_NOW' is the symbolic name meant to be used by - basically all programs ever that want to get version information. 
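The share interface above in a single-threaded sketch. CURLOPT_SHARE is assumed from the full CURLoption list (it is not in the hunks shown here), and the lock callbacks are omitted because nothing runs concurrently:

#include <curl/curl.h>

/* Let two easy handles reuse one DNS cache. */
static CURLSH *make_dns_share(CURL *h1, CURL *h2)
{
    CURLSH *share = curl_share_init();
    curl_share_setopt(share, CURLSHOPT_SHARE, CURL_LOCK_DATA_DNS);
    curl_easy_setopt(h1, CURLOPT_SHARE, share);
    curl_easy_setopt(h2, CURLOPT_SHARE, share);
    return share; /* curl_share_cleanup() once both handles are finished */
}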
It is - meant to be a built-in version number for what kind of struct the caller - expects. If the struct ever changes, we redefine the NOW to another enum - from above. */ -#define CURLVERSION_NOW CURLVERSION_FOURTH - -typedef struct { - CURLversion age; /* age of the returned struct */ - const char *version; /* LIBCURL_VERSION */ - unsigned int version_num; /* LIBCURL_VERSION_NUM */ - const char *host; /* OS/host/cpu/machine when configured */ - int features; /* bitmask, see defines below */ - const char *ssl_version; /* human readable string */ - long ssl_version_num; /* not used anymore, always 0 */ - const char *libz_version; /* human readable string */ - /* protocols is terminated by an entry with a NULL protoname */ - const char * const *protocols; - - /* The fields below this were added in CURLVERSION_SECOND */ - const char *ares; - int ares_num; - - /* This field was added in CURLVERSION_THIRD */ - const char *libidn; - - /* These field were added in CURLVERSION_FOURTH */ - - /* Same as '_libiconv_version' if built with HAVE_ICONV */ - int iconv_ver_num; - - const char *libssh_version; /* human readable string */ - -} curl_version_info_data; - -#define CURL_VERSION_IPV6 (1<<0) /* IPv6-enabled */ -#define CURL_VERSION_KERBEROS4 (1<<1) /* kerberos auth is supported */ -#define CURL_VERSION_SSL (1<<2) /* SSL options are present */ -#define CURL_VERSION_LIBZ (1<<3) /* libz features are present */ -#define CURL_VERSION_NTLM (1<<4) /* NTLM auth is supported */ -#define CURL_VERSION_GSSNEGOTIATE (1<<5) /* Negotiate auth support */ -#define CURL_VERSION_DEBUG (1<<6) /* built with debug capabilities */ -#define CURL_VERSION_ASYNCHDNS (1<<7) /* asynchronous dns resolves */ -#define CURL_VERSION_SPNEGO (1<<8) /* SPNEGO auth */ -#define CURL_VERSION_LARGEFILE (1<<9) /* supports files bigger than 2GB */ -#define CURL_VERSION_IDN (1<<10) /* International Domain Names support */ -#define CURL_VERSION_SSPI (1<<11) /* SSPI is supported */ -#define CURL_VERSION_CONV (1<<12) /* character conversions supported */ -#define CURL_VERSION_CURLDEBUG (1<<13) /* debug memory tracking supported */ - -/* - * NAME curl_version_info() - * - * DESCRIPTION - * - * This function returns a pointer to a static copy of the version info - * struct. See above. - */ -CURL_EXTERN curl_version_info_data *curl_version_info(CURLversion); - -/* - * NAME curl_easy_strerror() - * - * DESCRIPTION - * - * The curl_easy_strerror function may be used to turn a CURLcode value - * into the equivalent human readable error string. This is useful - * for printing meaningful error messages. - */ -CURL_EXTERN const char *curl_easy_strerror(CURLcode); - -/* - * NAME curl_share_strerror() - * - * DESCRIPTION - * - * The curl_share_strerror function may be used to turn a CURLSHcode value - * into the equivalent human readable error string. This is useful - * for printing meaningful error messages. - */ -CURL_EXTERN const char *curl_share_strerror(CURLSHcode); - -/* - * NAME curl_easy_pause() - * - * DESCRIPTION - * - * The curl_easy_pause function pauses or unpauses transfers. Select the new - * state by setting the bitmask, use the convenience defines below. 
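A sketch of querying the runtime version struct and testing one of the feature bits listed above (illustrative, not part of the patch):

#include <stdio.h>
#include <curl/curl.h>

static void report_ssl_support(void)
{
    curl_version_info_data *info = curl_version_info(CURLVERSION_NOW);
    printf("libcurl %s: SSL %s\n", info->version,
           (info->features & CURL_VERSION_SSL) ? "yes" : "no");
}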
- * - */ -CURL_EXTERN CURLcode curl_easy_pause(CURL *handle, int bitmask); - -#define CURLPAUSE_RECV (1<<0) -#define CURLPAUSE_RECV_CONT (0) - -#define CURLPAUSE_SEND (1<<2) -#define CURLPAUSE_SEND_CONT (0) - -#define CURLPAUSE_ALL (CURLPAUSE_RECV|CURLPAUSE_SEND) -#define CURLPAUSE_CONT (CURLPAUSE_RECV_CONT|CURLPAUSE_SEND_CONT) - -#ifdef __cplusplus -} -#endif - -/* unfortunately, the easy.h and multi.h include files need options and info - stuff before they can be included! */ -#include "easy.h" /* nothing in curl is fun without the easy stuff */ -#include "multi.h" - -/* the typechecker doesn't work in C++ (yet) */ -#if defined(__GNUC__) && defined(__GNUC_MINOR__) && \ - ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 3)) && \ - !defined(__cplusplus) && !defined(CURL_DISABLE_TYPECHECK) -#include "typecheck-gcc.h" -#else -#if defined(__STDC__) && (__STDC__ >= 1) -/* This preprocessor magic that replaces a call with the exact same call is - only done to make sure application authors pass exactly three arguments - to these functions. */ -#define curl_easy_setopt(handle,opt,param) curl_easy_setopt(handle,opt,param) -#define curl_easy_getinfo(handle,info,arg) curl_easy_getinfo(handle,info,arg) -#define curl_share_setopt(share,opt,param) curl_share_setopt(share,opt,param) -#define curl_multi_setopt(handle,opt,param) curl_multi_setopt(handle,opt,param) -#endif /* __STDC__ >= 1 */ -#endif /* gcc >= 4.3 && !__cplusplus */ - -#endif /* __CURL_CURL_H */ diff --git a/common/curl/curlbuild.h b/common/curl/curlbuild.h deleted file mode 100644 index d0b32acb..00000000 --- a/common/curl/curlbuild.h +++ /dev/null @@ -1,583 +0,0 @@ -#ifndef __CURL_CURLBUILD_H -#define __CURL_CURLBUILD_H -/*************************************************************************** - * _ _ ____ _ - * Project ___| | | | _ \| | - * / __| | | | |_) | | - * | (__| |_| | _ <| |___ - * \___|\___/|_| \_\_____| - * - * Copyright (C) 1998 - 2010, Daniel Stenberg, , et al. - * - * This software is licensed as described in the file COPYING, which - * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. - * - * You may opt to use, copy, modify, merge, publish, distribute and/or sell - * copies of the Software, and permit persons to whom the Software is - * furnished to do so, under the terms of the COPYING file. - * - * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY - * KIND, either express or implied. - * - ***************************************************************************/ - -/* ================================================================ */ -/* NOTES FOR CONFIGURE CAPABLE SYSTEMS */ -/* ================================================================ */ - -/* - * NOTE 1: - * ------- - * - * See file include/curl/curlbuild.h.in, run configure, and forget - * that this file exists it is only used for non-configure systems. - * But you can keep reading if you want ;-) - * - */ - -/* ================================================================ */ -/* NOTES FOR NON-CONFIGURE SYSTEMS */ -/* ================================================================ */ - -/* - * NOTE 1: - * ------- - * - * Nothing in this file is intended to be modified or adjusted by the - * curl library user nor by the curl library builder. 
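Closing out the curl.h API above: the pause bitmask in use, typically from within a write callback (a sketch, not part of the patch):

#include <curl/curl.h>

static void pause_then_resume(CURL *handle)
{
    curl_easy_pause(handle, CURLPAUSE_RECV); /* stop the receive side   */
    /* ... drain application buffers ... */
    curl_easy_pause(handle, CURLPAUSE_CONT); /* unpause both directions */
}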
- * - * If you think that something actually needs to be changed, adjusted - * or fixed in this file, then, report it on the libcurl development - * mailing list: http://cool.haxx.se/mailman/listinfo/curl-library/ - * - * Try to keep one section per platform, compiler and architecture, - * otherwise, if an existing section is reused for a different one and - * later on the original is adjusted, probably the piggybacking one can - * be adversely changed. - * - * In order to differentiate between platforms/compilers/architectures - * use only compiler built in predefined preprocessor symbols. - * - * This header file shall only export symbols which are 'curl' or 'CURL' - * prefixed, otherwise public name space would be polluted. - * - * NOTE 2: - * ------- - * - * For any given platform/compiler curl_off_t must be typedef'ed to a - * 64-bit wide signed integral data type. The width of this data type - * must remain constant and independent of any possible large file - * support settings. - * - * As an exception to the above, curl_off_t shall be typedef'ed to a - * 32-bit wide signed integral data type if there is no 64-bit type. - * - * As a general rule, curl_off_t shall not be mapped to off_t. This - * rule shall only be violated if off_t is the only 64-bit data type - * available and the size of off_t is independent of large file support - * settings. Keep your build on the safe side avoiding an off_t gating. - * If you have a 64-bit off_t then take for sure that another 64-bit - * data type exists, dig deeper and you will find it. - * - * NOTE 3: - * ------- - * - * Right now you might be staring at file include/curl/curlbuild.h.dist or - * at file include/curl/curlbuild.h, this is due to the following reason: - * file include/curl/curlbuild.h.dist is renamed to include/curl/curlbuild.h - * when the libcurl source code distribution archive file is created. - * - * File include/curl/curlbuild.h.dist is not included in the distribution - * archive. File include/curl/curlbuild.h is not present in the git tree. - * - * The distributed include/curl/curlbuild.h file is only intended to be used - * on systems which can not run the also distributed configure script. - * - * On systems capable of running the configure script, the configure process - * will overwrite the distributed include/curl/curlbuild.h file with one that - * is suitable and specific to the library being configured and built, which - * is generated from the include/curl/curlbuild.h.in template file. - * - * If you check out from git on a non-configure platform, you must run the - * appropriate buildconf* script to set up curlbuild.h and other local files. 
- * - */ - -/* ================================================================ */ -/* DEFINITION OF THESE SYMBOLS SHALL NOT TAKE PLACE ANYWHERE ELSE */ -/* ================================================================ */ - -#ifdef CURL_SIZEOF_LONG -# error "CURL_SIZEOF_LONG shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_SIZEOF_LONG_already_defined -#endif - -#ifdef CURL_TYPEOF_CURL_SOCKLEN_T -# error "CURL_TYPEOF_CURL_SOCKLEN_T shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_TYPEOF_CURL_SOCKLEN_T_already_defined -#endif - -#ifdef CURL_SIZEOF_CURL_SOCKLEN_T -# error "CURL_SIZEOF_CURL_SOCKLEN_T shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_SIZEOF_CURL_SOCKLEN_T_already_defined -#endif - -#ifdef CURL_TYPEOF_CURL_OFF_T -# error "CURL_TYPEOF_CURL_OFF_T shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_TYPEOF_CURL_OFF_T_already_defined -#endif - -#ifdef CURL_FORMAT_CURL_OFF_T -# error "CURL_FORMAT_CURL_OFF_T shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_FORMAT_CURL_OFF_T_already_defined -#endif - -#ifdef CURL_FORMAT_CURL_OFF_TU -# error "CURL_FORMAT_CURL_OFF_TU shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_FORMAT_CURL_OFF_TU_already_defined -#endif - -#ifdef CURL_FORMAT_OFF_T -# error "CURL_FORMAT_OFF_T shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_FORMAT_OFF_T_already_defined -#endif - -#ifdef CURL_SIZEOF_CURL_OFF_T -# error "CURL_SIZEOF_CURL_OFF_T shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_SIZEOF_CURL_OFF_T_already_defined -#endif - -#ifdef CURL_SUFFIX_CURL_OFF_T -# error "CURL_SUFFIX_CURL_OFF_T shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_SUFFIX_CURL_OFF_T_already_defined -#endif - -#ifdef CURL_SUFFIX_CURL_OFF_TU -# error "CURL_SUFFIX_CURL_OFF_TU shall not be defined except in curlbuild.h" - Error Compilation_aborted_CURL_SUFFIX_CURL_OFF_TU_already_defined -#endif - -/* ================================================================ */ -/* EXTERNAL INTERFACE SETTINGS FOR NON-CONFIGURE SYSTEMS ONLY */ -/* ================================================================ */ - -#if defined(__DJGPP__) || defined(__GO32__) -# if defined(__DJGPP__) && (__DJGPP__ > 1) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long long -# define CURL_FORMAT_CURL_OFF_T "lld" -# define CURL_FORMAT_CURL_OFF_TU "llu" -# define CURL_FORMAT_OFF_T "%lld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T LL -# define CURL_SUFFIX_CURL_OFF_TU ULL -# else -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 4 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# endif -# define CURL_TYPEOF_CURL_SOCKLEN_T int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -#elif defined(__SALFORDC__) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 4 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# define CURL_TYPEOF_CURL_SOCKLEN_T int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -#elif defined(__BORLANDC__) -# if (__BORLANDC__ < 0x520) -# define CURL_SIZEOF_LONG 4 -# define 
CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 4 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# else -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T __int64 -# define CURL_FORMAT_CURL_OFF_T "I64d" -# define CURL_FORMAT_CURL_OFF_TU "I64u" -# define CURL_FORMAT_OFF_T "%I64d" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T i64 -# define CURL_SUFFIX_CURL_OFF_TU ui64 -# endif -# define CURL_TYPEOF_CURL_SOCKLEN_T int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -#elif defined(__TURBOC__) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 4 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# define CURL_TYPEOF_CURL_SOCKLEN_T int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -#elif defined(__WATCOMC__) -# if defined(__386__) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T __int64 -# define CURL_FORMAT_CURL_OFF_T "I64d" -# define CURL_FORMAT_CURL_OFF_TU "I64u" -# define CURL_FORMAT_OFF_T "%I64d" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T i64 -# define CURL_SUFFIX_CURL_OFF_TU ui64 -# else -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 4 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# endif -# define CURL_TYPEOF_CURL_SOCKLEN_T int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -#elif defined(__POCC__) -# if (__POCC__ < 280) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 4 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# elif defined(_MSC_VER) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T __int64 -# define CURL_FORMAT_CURL_OFF_T "I64d" -# define CURL_FORMAT_CURL_OFF_TU "I64u" -# define CURL_FORMAT_OFF_T "%I64d" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T i64 -# define CURL_SUFFIX_CURL_OFF_TU ui64 -# else -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long long -# define CURL_FORMAT_CURL_OFF_T "lld" -# define CURL_FORMAT_CURL_OFF_TU "llu" -# define CURL_FORMAT_OFF_T "%lld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T LL -# define CURL_SUFFIX_CURL_OFF_TU ULL -# endif -# define CURL_TYPEOF_CURL_SOCKLEN_T int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -#elif defined(__LCC__) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 4 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# define CURL_TYPEOF_CURL_SOCKLEN_T int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -#elif defined(__SYMBIAN32__) -# if defined(__EABI__) /* Treat all ARM compilers equally */ -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long long -# define CURL_FORMAT_CURL_OFF_T "lld" -# define CURL_FORMAT_CURL_OFF_TU "llu" -# define CURL_FORMAT_OFF_T "%lld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define 
CURL_SUFFIX_CURL_OFF_T LL -# define CURL_SUFFIX_CURL_OFF_TU ULL -# elif defined(__CW32__) -# pragma longlong on -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long long -# define CURL_FORMAT_CURL_OFF_T "lld" -# define CURL_FORMAT_CURL_OFF_TU "llu" -# define CURL_FORMAT_OFF_T "%lld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T LL -# define CURL_SUFFIX_CURL_OFF_TU ULL -# elif defined(__VC32__) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T __int64 -# define CURL_FORMAT_CURL_OFF_T "lld" -# define CURL_FORMAT_CURL_OFF_TU "llu" -# define CURL_FORMAT_OFF_T "%lld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T LL -# define CURL_SUFFIX_CURL_OFF_TU ULL -# endif -# define CURL_TYPEOF_CURL_SOCKLEN_T unsigned int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -#elif defined(__MWERKS__) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long long -# define CURL_FORMAT_CURL_OFF_T "lld" -# define CURL_FORMAT_CURL_OFF_TU "llu" -# define CURL_FORMAT_OFF_T "%lld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T LL -# define CURL_SUFFIX_CURL_OFF_TU ULL -# define CURL_TYPEOF_CURL_SOCKLEN_T int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -#elif defined(_WIN32_WCE) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T __int64 -# define CURL_FORMAT_CURL_OFF_T "I64d" -# define CURL_FORMAT_CURL_OFF_TU "I64u" -# define CURL_FORMAT_OFF_T "%I64d" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T i64 -# define CURL_SUFFIX_CURL_OFF_TU ui64 -# define CURL_TYPEOF_CURL_SOCKLEN_T int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -#elif defined(__MINGW32__) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long long -# define CURL_FORMAT_CURL_OFF_T "I64d" -# define CURL_FORMAT_CURL_OFF_TU "I64u" -# define CURL_FORMAT_OFF_T "%I64d" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T LL -# define CURL_SUFFIX_CURL_OFF_TU ULL -# define CURL_TYPEOF_CURL_SOCKLEN_T int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -#elif defined(__VMS) -# if defined(__VAX) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 4 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# else -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long long -# define CURL_FORMAT_CURL_OFF_T "lld" -# define CURL_FORMAT_CURL_OFF_TU "llu" -# define CURL_FORMAT_OFF_T "%lld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T LL -# define CURL_SUFFIX_CURL_OFF_TU ULL -# endif -# define CURL_TYPEOF_CURL_SOCKLEN_T unsigned int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -#elif defined(__OS400__) -# if defined(__ILEC400__) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long long -# define CURL_FORMAT_CURL_OFF_T "lld" -# define CURL_FORMAT_CURL_OFF_TU "llu" -# define CURL_FORMAT_OFF_T "%lld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T LL -# define CURL_SUFFIX_CURL_OFF_TU ULL -# define CURL_TYPEOF_CURL_SOCKLEN_T socklen_t -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 -# define CURL_PULL_SYS_TYPES_H 1 -# define CURL_PULL_SYS_SOCKET_H 1 -# endif - -#elif defined(__MVS__) -# if defined(__IBMC__) || defined(__IBMCPP__) -# if defined(_ILP32) -# define CURL_SIZEOF_LONG 4 -# elif defined(_LP64) -# define CURL_SIZEOF_LONG 8 -# endif -# if defined(_LONG_LONG) -# define CURL_TYPEOF_CURL_OFF_T long long -# define 
CURL_FORMAT_CURL_OFF_T "lld" -# define CURL_FORMAT_CURL_OFF_TU "llu" -# define CURL_FORMAT_OFF_T "%lld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T LL -# define CURL_SUFFIX_CURL_OFF_TU ULL -# elif defined(_LP64) -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# else -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 4 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# endif -# define CURL_TYPEOF_CURL_SOCKLEN_T socklen_t -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 -# define CURL_PULL_SYS_TYPES_H 1 -# define CURL_PULL_SYS_SOCKET_H 1 -# endif - -#elif defined(__370__) -# if defined(__IBMC__) || defined(__IBMCPP__) -# if defined(_ILP32) -# define CURL_SIZEOF_LONG 4 -# elif defined(_LP64) -# define CURL_SIZEOF_LONG 8 -# endif -# if defined(_LONG_LONG) -# define CURL_TYPEOF_CURL_OFF_T long long -# define CURL_FORMAT_CURL_OFF_T "lld" -# define CURL_FORMAT_CURL_OFF_TU "llu" -# define CURL_FORMAT_OFF_T "%lld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T LL -# define CURL_SUFFIX_CURL_OFF_TU ULL -# elif defined(_LP64) -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# else -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 4 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# endif -# define CURL_TYPEOF_CURL_SOCKLEN_T socklen_t -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 -# define CURL_PULL_SYS_TYPES_H 1 -# define CURL_PULL_SYS_SOCKET_H 1 -# endif - -#elif defined(TPF) -# define CURL_SIZEOF_LONG 8 -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# define CURL_TYPEOF_CURL_SOCKLEN_T int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -/* ===================================== */ -/* KEEP MSVC THE PENULTIMATE ENTRY */ -/* ===================================== */ - -#elif defined(_MSC_VER) -# if (_MSC_VER >= 900) && (_INTEGRAL_MAX_BITS >= 64) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T __int64 -# define CURL_FORMAT_CURL_OFF_T "I64d" -# define CURL_FORMAT_CURL_OFF_TU "I64u" -# define CURL_FORMAT_OFF_T "%I64d" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T i64 -# define CURL_SUFFIX_CURL_OFF_TU ui64 -# else -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 4 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# endif -# define CURL_TYPEOF_CURL_SOCKLEN_T int -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 - -/* ===================================== */ -/* KEEP GENERIC GCC THE LAST ENTRY */ -/* ===================================== */ - -#elif 
defined(__GNUC__) -# if defined(__i386__) || defined(__ppc__) -# define CURL_SIZEOF_LONG 4 -# define CURL_TYPEOF_CURL_OFF_T long long -# define CURL_FORMAT_CURL_OFF_T "lld" -# define CURL_FORMAT_CURL_OFF_TU "llu" -# define CURL_FORMAT_OFF_T "%lld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T LL -# define CURL_SUFFIX_CURL_OFF_TU ULL -# elif defined(__x86_64__) || defined(__ppc64__) -# define CURL_SIZEOF_LONG 8 -# define CURL_TYPEOF_CURL_OFF_T long -# define CURL_FORMAT_CURL_OFF_T "ld" -# define CURL_FORMAT_CURL_OFF_TU "lu" -# define CURL_FORMAT_OFF_T "%ld" -# define CURL_SIZEOF_CURL_OFF_T 8 -# define CURL_SUFFIX_CURL_OFF_T L -# define CURL_SUFFIX_CURL_OFF_TU UL -# endif -# define CURL_TYPEOF_CURL_SOCKLEN_T socklen_t -# define CURL_SIZEOF_CURL_SOCKLEN_T 4 -# define CURL_PULL_SYS_TYPES_H 1 -# define CURL_PULL_SYS_SOCKET_H 1 - -#else -# error "Unknown non-configure build target!" - Error Compilation_aborted_Unknown_non_configure_build_target -#endif - -/* CURL_PULL_SYS_TYPES_H is defined above when inclusion of header file */ -/* sys/types.h is required here to properly make type definitions below. */ -#ifdef CURL_PULL_SYS_TYPES_H -# include -#endif - -/* CURL_PULL_SYS_SOCKET_H is defined above when inclusion of header file */ -/* sys/socket.h is required here to properly make type definitions below. */ -#ifdef CURL_PULL_SYS_SOCKET_H -# include -#endif - -/* Data type definition of curl_socklen_t. */ - -#ifdef CURL_TYPEOF_CURL_SOCKLEN_T - typedef CURL_TYPEOF_CURL_SOCKLEN_T curl_socklen_t; -#endif - -/* Data type definition of curl_off_t. */ - -#ifdef CURL_TYPEOF_CURL_OFF_T - typedef CURL_TYPEOF_CURL_OFF_T curl_off_t; -#endif - -#endif /* __CURL_CURLBUILD_H */ diff --git a/common/curl/curlrules.h b/common/curl/curlrules.h deleted file mode 100644 index 8aad1df6..00000000 --- a/common/curl/curlrules.h +++ /dev/null @@ -1,252 +0,0 @@ -#ifndef __CURL_CURLRULES_H -#define __CURL_CURLRULES_H -/*************************************************************************** - * _ _ ____ _ - * Project ___| | | | _ \| | - * / __| | | | |_) | | - * | (__| |_| | _ <| |___ - * \___|\___/|_| \_\_____| - * - * Copyright (C) 1998 - 2010, Daniel Stenberg, , et al. - * - * This software is licensed as described in the file COPYING, which - * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. - * - * You may opt to use, copy, modify, merge, publish, distribute and/or sell - * copies of the Software, and permit persons to whom the Software is - * furnished to do so, under the terms of the COPYING file. - * - * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY - * KIND, either express or implied. - * - ***************************************************************************/ - -/* ================================================================ */ -/* COMPILE TIME SANITY CHECKS */ -/* ================================================================ */ - -/* - * NOTE 1: - * ------- - * - * All checks done in this file are intentionally placed in a public - * header file which is pulled by curl/curl.h when an application is - * being built using an already built libcurl library. Additionally - * this file is also included and used when building the library. - * - * If compilation fails on this file it is certainly sure that the - * problem is elsewhere. 
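With curl_off_t typedef'ed per platform as above, the companion format macro is how it gets printed portably; a sketch assuming only what this header defines:

#include <stdio.h>
#include <curl/curl.h>

/* CURL_FORMAT_CURL_OFF_T expands to "lld", "ld", "I64d", ... to match
   the typedef, so no cast is needed. */
static void print_offset(curl_off_t off)
{
    printf("offset: %" CURL_FORMAT_CURL_OFF_T "\n", off);
}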
It could be a problem in the curlbuild.h - * header file, or simply that you are using different compilation - * settings than those used to build the library. - * - * Nothing in this file is intended to be modified or adjusted by the - * curl library user nor by the curl library builder. - * - * Do not deactivate any check, these are done to make sure that the - * library is properly built and used. - * - * You can find further help on the libcurl development mailing list: - * http://cool.haxx.se/mailman/listinfo/curl-library/ - * - * NOTE 2 - * ------ - * - * Some of the following compile time checks are based on the fact - * that the dimension of a constant array can not be a negative one. - * In this way if the compile time verification fails, the compilation - * will fail issuing an error. The error description wording is compiler - * dependent but it will be quite similar to one of the following: - * - * "negative subscript or subscript is too large" - * "array must have at least one element" - * "-1 is an illegal array size" - * "size of array is negative" - * - * If you are building an application which tries to use an already - * built libcurl library and you are getting this kind of errors on - * this file, it is a clear indication that there is a mismatch between - * how the library was built and how you are trying to use it for your - * application. Your already compiled or binary library provider is the - * only one who can give you the details you need to properly use it. - */ - -/* - * Verify that some macros are actually defined. - */ - -#ifndef CURL_SIZEOF_LONG -# error "CURL_SIZEOF_LONG definition is missing!" - Error Compilation_aborted_CURL_SIZEOF_LONG_is_missing -#endif - -#ifndef CURL_TYPEOF_CURL_SOCKLEN_T -# error "CURL_TYPEOF_CURL_SOCKLEN_T definition is missing!" - Error Compilation_aborted_CURL_TYPEOF_CURL_SOCKLEN_T_is_missing -#endif - -#ifndef CURL_SIZEOF_CURL_SOCKLEN_T -# error "CURL_SIZEOF_CURL_SOCKLEN_T definition is missing!" - Error Compilation_aborted_CURL_SIZEOF_CURL_SOCKLEN_T_is_missing -#endif - -#ifndef CURL_TYPEOF_CURL_OFF_T -# error "CURL_TYPEOF_CURL_OFF_T definition is missing!" - Error Compilation_aborted_CURL_TYPEOF_CURL_OFF_T_is_missing -#endif - -#ifndef CURL_FORMAT_CURL_OFF_T -# error "CURL_FORMAT_CURL_OFF_T definition is missing!" - Error Compilation_aborted_CURL_FORMAT_CURL_OFF_T_is_missing -#endif - -#ifndef CURL_FORMAT_CURL_OFF_TU -# error "CURL_FORMAT_CURL_OFF_TU definition is missing!" - Error Compilation_aborted_CURL_FORMAT_CURL_OFF_TU_is_missing -#endif - -#ifndef CURL_FORMAT_OFF_T -# error "CURL_FORMAT_OFF_T definition is missing!" - Error Compilation_aborted_CURL_FORMAT_OFF_T_is_missing -#endif - -#ifndef CURL_SIZEOF_CURL_OFF_T -# error "CURL_SIZEOF_CURL_OFF_T definition is missing!" - Error Compilation_aborted_CURL_SIZEOF_CURL_OFF_T_is_missing -#endif - -#ifndef CURL_SUFFIX_CURL_OFF_T -# error "CURL_SUFFIX_CURL_OFF_T definition is missing!" - Error Compilation_aborted_CURL_SUFFIX_CURL_OFF_T_is_missing -#endif - -#ifndef CURL_SUFFIX_CURL_OFF_TU -# error "CURL_SUFFIX_CURL_OFF_TU definition is missing!" - Error Compilation_aborted_CURL_SUFFIX_CURL_OFF_TU_is_missing -#endif - -/* - * Macros private to this header file. - */ - -#define CurlchkszEQ(t, s) sizeof(t) == s ? 1 : -1 - -#define CurlchkszGE(t1, t2) sizeof(t1) >= sizeof(t2) ? 1 : -1 - -/* - * Verify that the size previously defined and expected for long - * is the same as the one reported by sizeof() at compile time. 
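The negative-array-size trick described above, reduced to a self-contained form so the mechanism is visible in isolation (the names here are illustrative, not from the patch):

/* Size is 1 when the condition holds, -1 (a compile error) when it fails. */
#define DEMO_STATIC_ASSERT(cond, name) typedef char name[(cond) ? 1 : -1]

DEMO_STATIC_ASSERT(sizeof(int) >= 2, int_has_at_least_16_bits); /* compiles */
/* DEMO_STATIC_ASSERT(sizeof(int) == 1, int_is_one_byte);  -- would fail  */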
- */ - -typedef char - __curl_rule_01__ - [CurlchkszEQ(long, CURL_SIZEOF_LONG)]; - -/* - * Verify that the size previously defined and expected for - * curl_off_t is actually the the same as the one reported - * by sizeof() at compile time. - */ - -typedef char - __curl_rule_02__ - [CurlchkszEQ(curl_off_t, CURL_SIZEOF_CURL_OFF_T)]; - -/* - * Verify at compile time that the size of curl_off_t as reported - * by sizeof() is greater or equal than the one reported for long - * for the current compilation. - */ - -typedef char - __curl_rule_03__ - [CurlchkszGE(curl_off_t, long)]; - -/* - * Verify that the size previously defined and expected for - * curl_socklen_t is actually the the same as the one reported - * by sizeof() at compile time. - */ - -typedef char - __curl_rule_04__ - [CurlchkszEQ(curl_socklen_t, CURL_SIZEOF_CURL_SOCKLEN_T)]; - -/* - * Verify at compile time that the size of curl_socklen_t as reported - * by sizeof() is greater or equal than the one reported for int for - * the current compilation. - */ - -typedef char - __curl_rule_05__ - [CurlchkszGE(curl_socklen_t, int)]; - -/* ================================================================ */ -/* EXTERNALLY AND INTERNALLY VISIBLE DEFINITIONS */ -/* ================================================================ */ - -/* - * CURL_ISOCPP and CURL_OFF_T_C definitions are done here in order to allow - * these to be visible and exported by the external libcurl interface API, - * while also making them visible to the library internals, simply including - * setup.h, without actually needing to include curl.h internally. - * If some day this section would grow big enough, all this should be moved - * to its own header file. - */ - -/* - * Figure out if we can use the ## preprocessor operator, which is supported - * by ISO/ANSI C and C++. Some compilers support it without setting __STDC__ - * or __cplusplus so we need to carefully check for them too. - */ - -#if defined(__STDC__) || defined(_MSC_VER) || defined(__cplusplus) || \ - defined(__HP_aCC) || defined(__BORLANDC__) || defined(__LCC__) || \ - defined(__POCC__) || defined(__SALFORDC__) || defined(__HIGHC__) || \ - defined(__ILEC400__) - /* This compiler is believed to have an ISO compatible preprocessor */ -#define CURL_ISOCPP -#else - /* This compiler is believed NOT to have an ISO compatible preprocessor */ -#undef CURL_ISOCPP -#endif - -/* - * Macros for minimum-width signed and unsigned curl_off_t integer constants. - */ - -#ifdef CURL_ISOCPP -# define __CURL_OFF_T_C_HELPER2(Val,Suffix) Val ## Suffix -#else -# define __CURL_OFF_T_C_HELPER2(Val,Suffix) Val/**/Suffix -#endif -#define __CURL_OFF_T_C_HELPER1(Val,Suffix) __CURL_OFF_T_C_HELPER2(Val,Suffix) -#define CURL_OFF_T_C(Val) __CURL_OFF_T_C_HELPER1(Val,CURL_SUFFIX_CURL_OFF_T) -#define CURL_OFF_TU_C(Val) __CURL_OFF_T_C_HELPER1(Val,CURL_SUFFIX_CURL_OFF_TU) - -/* - * Get rid of macros private to this header file. - */ - -#undef CurlchkszEQ -#undef CurlchkszGE - -/* - * Get rid of macros not intended to exist beyond this point. 
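CURL_OFF_T_C from above in use: it pastes the platform's suffix onto a constant so literals wider than 32 bits stay valid even where plain long is 32-bit. CURLOPT_MAXFILESIZE_LARGE is assumed from the full option list (not in these hunks):

#include <curl/curl.h>

/* Refuse downloads larger than 8 GiB; 0x200000000 needs the LL/i64
   suffix on 32-bit-long platforms, which the macro supplies. */
static void cap_download(CURL *handle)
{
    curl_off_t limit = CURL_OFF_T_C(0x200000000);
    curl_easy_setopt(handle, CURLOPT_MAXFILESIZE_LARGE, limit);
}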
- */ - -#undef CURL_PULL_WS2TCPIP_H -#undef CURL_PULL_SYS_TYPES_H -#undef CURL_PULL_SYS_SOCKET_H -#undef CURL_PULL_STDINT_H -#undef CURL_PULL_INTTYPES_H - -#undef CURL_TYPEOF_CURL_SOCKLEN_T -#undef CURL_TYPEOF_CURL_OFF_T - -#ifdef CURL_NO_OLDIES -#undef CURL_FORMAT_OFF_T /* not required since 7.19.0 - obsoleted in 7.20.0 */ -#endif - -#endif /* __CURL_CURLRULES_H */ diff --git a/common/curl/curlver.h b/common/curl/curlver.h deleted file mode 100644 index e345f56d..00000000 --- a/common/curl/curlver.h +++ /dev/null @@ -1,69 +0,0 @@ -#ifndef __CURL_CURLVER_H -#define __CURL_CURLVER_H -/*************************************************************************** - * _ _ ____ _ - * Project ___| | | | _ \| | - * / __| | | | |_) | | - * | (__| |_| | _ <| |___ - * \___|\___/|_| \_\_____| - * - * Copyright (C) 1998 - 2010, Daniel Stenberg, , et al. - * - * This software is licensed as described in the file COPYING, which - * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. - * - * You may opt to use, copy, modify, merge, publish, distribute and/or sell - * copies of the Software, and permit persons to whom the Software is - * furnished to do so, under the terms of the COPYING file. - * - * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY - * KIND, either express or implied. - * - ***************************************************************************/ - -/* This header file contains nothing but libcurl version info, generated by - a script at release-time. This was made its own header file in 7.11.2 */ - -/* This is the global package copyright */ -#define LIBCURL_COPYRIGHT "1996 - 2010 Daniel Stenberg, ." - -/* This is the version number of the libcurl package from which this header - file origins: */ -#define LIBCURL_VERSION "7.21.2" - -/* The numeric version number is also available "in parts" by using these - defines: */ -#define LIBCURL_VERSION_MAJOR 7 -#define LIBCURL_VERSION_MINOR 21 -#define LIBCURL_VERSION_PATCH 2 - -/* This is the numeric version of the libcurl version number, meant for easier - parsing and comparions by programs. The LIBCURL_VERSION_NUM define will - always follow this syntax: - - 0xXXYYZZ - - Where XX, YY and ZZ are the main version, release and patch numbers in - hexadecimal (using 8 bits each). All three numbers are always represented - using two digits. 1.2 would appear as "0x010200" while version 9.11.7 - appears as "0x090b07". - - This 6-digit (24 bits) hexadecimal number does not show pre-release number, - and it is always a greater number in a more recent release. It makes - comparisons with greater than and less than work. -*/ -#define LIBCURL_VERSION_NUM 0x071502 - -/* - * This is the date and time when the full source package was created. The - * timestamp is not stored in git, as the timestamp is properly set in the - * tarballs by the maketgz script. 
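The packed LIBCURL_VERSION_NUM described above is meant for exactly this kind of compile-time gate (a sketch; the feature macro name is made up):

#include <curl/curl.h>

/* 7.21.2 packs to 0x071502, so numeric comparison tracks release order. */
#if LIBCURL_VERSION_NUM >= 0x071502
#  define DEMO_HAVE_RECENT_CURL 1
#else
#  define DEMO_HAVE_RECENT_CURL 0
#endif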
- * - * The format of the date should follow this template: - * - * "Mon Feb 12 11:35:33 UTC 2007" - */ -#define LIBCURL_TIMESTAMP "Tue Oct 12 22:03:31 UTC 2010" - -#endif /* __CURL_CURLVER_H */ diff --git a/common/curl/easy.h b/common/curl/easy.h deleted file mode 100644 index 1ddb4fe5..00000000 --- a/common/curl/easy.h +++ /dev/null @@ -1,102 +0,0 @@ -#ifndef __CURL_EASY_H -#define __CURL_EASY_H -/*************************************************************************** - * _ _ ____ _ - * Project ___| | | | _ \| | - * / __| | | | |_) | | - * | (__| |_| | _ <| |___ - * \___|\___/|_| \_\_____| - * - * Copyright (C) 1998 - 2008, Daniel Stenberg, , et al. - * - * This software is licensed as described in the file COPYING, which - * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. - * - * You may opt to use, copy, modify, merge, publish, distribute and/or sell - * copies of the Software, and permit persons to whom the Software is - * furnished to do so, under the terms of the COPYING file. - * - * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY - * KIND, either express or implied. - * - ***************************************************************************/ -#ifdef __cplusplus -extern "C" { -#endif - -CURL_EXTERN CURL *curl_easy_init(void); -CURL_EXTERN CURLcode curl_easy_setopt(CURL *curl, CURLoption option, ...); -CURL_EXTERN CURLcode curl_easy_perform(CURL *curl); -CURL_EXTERN void curl_easy_cleanup(CURL *curl); - -/* - * NAME curl_easy_getinfo() - * - * DESCRIPTION - * - * Request internal information from the curl session with this function. The - * third argument MUST be a pointer to a long, a pointer to a char * or a - * pointer to a double (as the documentation describes elsewhere). The data - * pointed to will be filled in accordingly and can be relied upon only if the - * function returns CURLE_OK. This function is intended to get used *AFTER* a - * performed transfer, all results from this function are undefined until the - * transfer is completed. - */ -CURL_EXTERN CURLcode curl_easy_getinfo(CURL *curl, CURLINFO info, ...); - - -/* - * NAME curl_easy_duphandle() - * - * DESCRIPTION - * - * Creates a new curl session handle with the same options set for the handle - * passed in. Duplicating a handle could only be a matter of cloning data and - * options, internal state info and things like persistant connections cannot - * be transfered. It is useful in multithreaded applications when you can run - * curl_easy_duphandle() for each new thread to avoid a series of identical - * curl_easy_setopt() invokes in every thread. - */ -CURL_EXTERN CURL* curl_easy_duphandle(CURL *curl); - -/* - * NAME curl_easy_reset() - * - * DESCRIPTION - * - * Re-initializes a CURL handle to the default values. This puts back the - * handle to the same state as it was in when it was just created. - * - * It does keep: live connections, the Session ID cache, the DNS cache and the - * cookies. - */ -CURL_EXTERN void curl_easy_reset(CURL *curl); - -/* - * NAME curl_easy_recv() - * - * DESCRIPTION - * - * Receives data from the connected socket. Use after successful - * curl_easy_perform() with CURLOPT_CONNECT_ONLY option. - */ -CURL_EXTERN CURLcode curl_easy_recv(CURL *curl, void *buffer, size_t buflen, - size_t *n); - -/* - * NAME curl_easy_send() - * - * DESCRIPTION - * - * Sends data over the connected socket. Use after successful - * curl_easy_perform() with CURLOPT_CONNECT_ONLY option. 
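The easy interface above, end to end: init, set options, perform, read results only after the transfer completes, clean up. CURLOPT_URL is assumed from the full option list; a sketch, not part of the patch:

#include <stdio.h>
#include <curl/curl.h>

static void fetch(const char *url)
{
    CURL *handle = curl_easy_init();
    if(!handle)
        return;
    curl_easy_setopt(handle, CURLOPT_URL, url);
    if(curl_easy_perform(handle) == CURLE_OK) {
        long code = 0;
        /* valid only after a completed transfer, per the note above */
        curl_easy_getinfo(handle, CURLINFO_RESPONSE_CODE, &code);
        printf("HTTP %ld\n", code);
    }
    curl_easy_cleanup(handle);
}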
- */ -CURL_EXTERN CURLcode curl_easy_send(CURL *curl, const void *buffer, - size_t buflen, size_t *n); - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/common/curl/mprintf.h b/common/curl/mprintf.h deleted file mode 100644 index de7dd2f3..00000000 --- a/common/curl/mprintf.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef __CURL_MPRINTF_H -#define __CURL_MPRINTF_H -/*************************************************************************** - * _ _ ____ _ - * Project ___| | | | _ \| | - * / __| | | | |_) | | - * | (__| |_| | _ <| |___ - * \___|\___/|_| \_\_____| - * - * Copyright (C) 1998 - 2006, Daniel Stenberg, , et al. - * - * This software is licensed as described in the file COPYING, which - * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. - * - * You may opt to use, copy, modify, merge, publish, distribute and/or sell - * copies of the Software, and permit persons to whom the Software is - * furnished to do so, under the terms of the COPYING file. - * - * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY - * KIND, either express or implied. - * - ***************************************************************************/ - -#include -#include /* needed for FILE */ - -#include "curl.h" - -#ifdef __cplusplus -extern "C" { -#endif - -CURL_EXTERN int curl_mprintf(const char *format, ...); -CURL_EXTERN int curl_mfprintf(FILE *fd, const char *format, ...); -CURL_EXTERN int curl_msprintf(char *buffer, const char *format, ...); -CURL_EXTERN int curl_msnprintf(char *buffer, size_t maxlength, - const char *format, ...); -CURL_EXTERN int curl_mvprintf(const char *format, va_list args); -CURL_EXTERN int curl_mvfprintf(FILE *fd, const char *format, va_list args); -CURL_EXTERN int curl_mvsprintf(char *buffer, const char *format, va_list args); -CURL_EXTERN int curl_mvsnprintf(char *buffer, size_t maxlength, - const char *format, va_list args); -CURL_EXTERN char *curl_maprintf(const char *format, ...); -CURL_EXTERN char *curl_mvaprintf(const char *format, va_list args); - -#ifdef _MPRINTF_REPLACE -# undef printf -# undef fprintf -# undef sprintf -# undef vsprintf -# undef snprintf -# undef vprintf -# undef vfprintf -# undef vsnprintf -# undef aprintf -# undef vaprintf -# define printf curl_mprintf -# define fprintf curl_mfprintf -#ifdef CURLDEBUG -/* When built with CURLDEBUG we define away the sprintf() functions since we - don't want internal code to be using them */ -# define sprintf sprintf_was_used -# define vsprintf vsprintf_was_used -#else -# define sprintf curl_msprintf -# define vsprintf curl_mvsprintf -#endif -# define snprintf curl_msnprintf -# define vprintf curl_mvprintf -# define vfprintf curl_mvfprintf -# define vsnprintf curl_mvsnprintf -# define aprintf curl_maprintf -# define vaprintf curl_mvaprintf -#endif - -#ifdef __cplusplus -} -#endif - -#endif /* __CURL_MPRINTF_H */ diff --git a/common/curl/multi.h b/common/curl/multi.h deleted file mode 100644 index f9656666..00000000 --- a/common/curl/multi.h +++ /dev/null @@ -1,345 +0,0 @@ -#ifndef __CURL_MULTI_H -#define __CURL_MULTI_H -/*************************************************************************** - * _ _ ____ _ - * Project ___| | | | _ \| | - * / __| | | | |_) | | - * | (__| |_| | _ <| |___ - * \___|\___/|_| \_\_____| - * - * Copyright (C) 1998 - 2007, Daniel Stenberg, , et al. - * - * This software is licensed as described in the file COPYING, which - * you should have received as part of this distribution. 
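The mprintf family above mirrors the stdio prototypes; curl_msnprintf() is the bounded variant (a sketch, not part of the patch):

#include <curl/mprintf.h>

static void format_demo(void)
{
    char buf[64];
    /* behaves like snprintf(): never writes past the given maximum */
    curl_msnprintf(buf, sizeof(buf), "%d bytes from %s", 512, "example.com");
}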
The terms - * are also available at http://curl.haxx.se/docs/copyright.html. - * - * You may opt to use, copy, modify, merge, publish, distribute and/or sell - * copies of the Software, and permit persons to whom the Software is - * furnished to do so, under the terms of the COPYING file. - * - * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY - * KIND, either express or implied. - * - ***************************************************************************/ -/* - This is an "external" header file. Don't give away any internals here! - - GOALS - - o Enable a "pull" interface. The application that uses libcurl decides where - and when to ask libcurl to get/send data. - - o Enable multiple simultaneous transfers in the same thread without making it - complicated for the application. - - o Enable the application to select() on its own file descriptors and curl's - file descriptors simultaneous easily. - -*/ - -/* - * This header file should not really need to include "curl.h" since curl.h - * itself includes this file and we expect user applications to do #include - * without the need for especially including multi.h. - * - * For some reason we added this include here at one point, and rather than to - * break existing (wrongly written) libcurl applications, we leave it as-is - * but with this warning attached. - */ -#include "curl.h" - -#ifdef __cplusplus -extern "C" { -#endif - -typedef void CURLM; - -typedef enum { - CURLM_CALL_MULTI_PERFORM = -1, /* please call curl_multi_perform() or - curl_multi_socket*() soon */ - CURLM_OK, - CURLM_BAD_HANDLE, /* the passed-in handle is not a valid CURLM handle */ - CURLM_BAD_EASY_HANDLE, /* an easy handle was not good/valid */ - CURLM_OUT_OF_MEMORY, /* if you ever get this, you're in deep sh*t */ - CURLM_INTERNAL_ERROR, /* this is a libcurl bug */ - CURLM_BAD_SOCKET, /* the passed in socket argument did not match */ - CURLM_UNKNOWN_OPTION, /* curl_multi_setopt() with unsupported option */ - CURLM_LAST -} CURLMcode; - -/* just to make code nicer when using curl_multi_socket() you can now check - for CURLM_CALL_MULTI_SOCKET too in the same style it works for - curl_multi_perform() and CURLM_CALL_MULTI_PERFORM */ -#define CURLM_CALL_MULTI_SOCKET CURLM_CALL_MULTI_PERFORM - -typedef enum { - CURLMSG_NONE, /* first, not used */ - CURLMSG_DONE, /* This easy handle has completed. 'result' contains - the CURLcode of the transfer */ - CURLMSG_LAST /* last, not used */ -} CURLMSG; - -struct CURLMsg { - CURLMSG msg; /* what this message means */ - CURL *easy_handle; /* the handle it concerns */ - union { - void *whatever; /* message-specific data */ - CURLcode result; /* return code for transfer */ - } data; -}; -typedef struct CURLMsg CURLMsg; - -/* - * Name: curl_multi_init() - * - * Desc: inititalize multi-style curl usage - * - * Returns: a new CURLM handle to use in all 'curl_multi' functions. - */ -CURL_EXTERN CURLM *curl_multi_init(void); - -/* - * Name: curl_multi_add_handle() - * - * Desc: add a standard curl handle to the multi stack - * - * Returns: CURLMcode type, general multi error code. - */ -CURL_EXTERN CURLMcode curl_multi_add_handle(CURLM *multi_handle, - CURL *curl_handle); - - /* - * Name: curl_multi_remove_handle() - * - * Desc: removes a curl handle from the multi stack again - * - * Returns: CURLMcode type, general multi error code. - */ -CURL_EXTERN CURLMcode curl_multi_remove_handle(CURLM *multi_handle, - CURL *curl_handle); - - /* - * Name: curl_multi_fdset() - * - * Desc: Ask curl for its fd_set sets. 
The app can use these to select() or - * poll() on. We want curl_multi_perform() called as soon as one of - * them are ready. - * - * Returns: CURLMcode type, general multi error code. - */ -CURL_EXTERN CURLMcode curl_multi_fdset(CURLM *multi_handle, - fd_set *read_fd_set, - fd_set *write_fd_set, - fd_set *exc_fd_set, - int *max_fd); - - /* - * Name: curl_multi_perform() - * - * Desc: When the app thinks there's data available for curl it calls this - * function to read/write whatever there is right now. This returns - * as soon as the reads and writes are done. This function does not - * require that there actually is data available for reading or that - * data can be written, it can be called just in case. It returns - * the number of handles that still transfer data in the second - * argument's integer-pointer. - * - * Returns: CURLMcode type, general multi error code. *NOTE* that this only - * returns errors etc regarding the whole multi stack. There might - * still have occurred problems on invidual transfers even when this - * returns OK. - */ -CURL_EXTERN CURLMcode curl_multi_perform(CURLM *multi_handle, - int *running_handles); - - /* - * Name: curl_multi_cleanup() - * - * Desc: Cleans up and removes a whole multi stack. It does not free or - * touch any individual easy handles in any way. We need to define - * in what state those handles will be if this function is called - * in the middle of a transfer. - * - * Returns: CURLMcode type, general multi error code. - */ -CURL_EXTERN CURLMcode curl_multi_cleanup(CURLM *multi_handle); - -/* - * Name: curl_multi_info_read() - * - * Desc: Ask the multi handle if there's any messages/informationals from - * the individual transfers. Messages include informationals such as - * error code from the transfer or just the fact that a transfer is - * completed. More details on these should be written down as well. - * - * Repeated calls to this function will return a new struct each - * time, until a special "end of msgs" struct is returned as a signal - * that there is no more to get at this point. - * - * The data the returned pointer points to will not survive calling - * curl_multi_cleanup(). - * - * The 'CURLMsg' struct is meant to be very simple and only contain - * very basic informations. If more involved information is wanted, - * we will provide the particular "transfer handle" in that struct - * and that should/could/would be used in subsequent - * curl_easy_getinfo() calls (or similar). The point being that we - * must never expose complex structs to applications, as then we'll - * undoubtably get backwards compatibility problems in the future. - * - * Returns: A pointer to a filled-in struct, or NULL if it failed or ran out - * of structs. It also writes the number of messages left in the - * queue (after this read) in the integer the second argument points - * to. - */ -CURL_EXTERN CURLMsg *curl_multi_info_read(CURLM *multi_handle, - int *msgs_in_queue); - -/* - * Name: curl_multi_strerror() - * - * Desc: The curl_multi_strerror function may be used to turn a CURLMcode - * value into the equivalent human readable error string. This is - * useful for printing meaningful error messages. - * - * Returns: A pointer to a zero-terminated error message. 
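Tying the pieces above together — curl_multi_fdset(), curl_multi_perform(), curl_multi_info_read(), curl_multi_strerror() — here is a sketch of the classic select()-driven multi loop; the fixed one-second tick stands in for a proper curl_multi_timeout() query, and all names are illustrative:

    #include <stdio.h>
    #include <sys/select.h>
    #include <curl/curl.h>

    static void multi_loop_sketch(CURLM *m)
    {
        int running = 1;
        while (running) {
            fd_set r, w, e;
            int maxfd = -1;
            FD_ZERO(&r); FD_ZERO(&w); FD_ZERO(&e);
            curl_multi_fdset(m, &r, &w, &e, &maxfd);
            if (maxfd >= 0) {
                struct timeval tv = {1, 0};  /* should come from curl_multi_timeout() */
                select(maxfd + 1, &r, &w, &e, &tv);
            }
            CURLMcode rc;
            while ((rc = curl_multi_perform(m, &running)) == CURLM_CALL_MULTI_PERFORM)
                ;  /* keep calling, per the CURLM_CALL_MULTI_PERFORM contract */
            if (rc != CURLM_OK)
                fprintf(stderr, "multi: %s\n", curl_multi_strerror(rc));
            int queued = 0;
            CURLMsg *msg;
            while ((msg = curl_multi_info_read(m, &queued)) != NULL)
                if (msg->msg == CURLMSG_DONE)
                    fprintf(stderr, "transfer done, result %d\n", (int) msg->data.result);
        }
    }

Note that per the docs above, errors reported by curl_multi_perform() concern the multi stack as a whole; individual transfer results arrive through curl_multi_info_read().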
- */ -CURL_EXTERN const char *curl_multi_strerror(CURLMcode); - -/* - * Name: curl_multi_socket() and - * curl_multi_socket_all() - * - * Desc: An alternative version of curl_multi_perform() that allows the - * application to pass in one of the file descriptors that have been - * detected to have "action" on them and let libcurl perform. - * See man page for details. - */ -#define CURL_POLL_NONE 0 -#define CURL_POLL_IN 1 -#define CURL_POLL_OUT 2 -#define CURL_POLL_INOUT 3 -#define CURL_POLL_REMOVE 4 - -#define CURL_SOCKET_TIMEOUT CURL_SOCKET_BAD - -#define CURL_CSELECT_IN 0x01 -#define CURL_CSELECT_OUT 0x02 -#define CURL_CSELECT_ERR 0x04 - -typedef int (*curl_socket_callback)(CURL *easy, /* easy handle */ - curl_socket_t s, /* socket */ - int what, /* see above */ - void *userp, /* private callback - pointer */ - void *socketp); /* private socket - pointer */ -/* - * Name: curl_multi_timer_callback - * - * Desc: Called by libcurl whenever the library detects a change in the - * maximum number of milliseconds the app is allowed to wait before - * curl_multi_socket() or curl_multi_perform() must be called - * (to allow libcurl's timed events to take place). - * - * Returns: The callback should return zero. - */ -typedef int (*curl_multi_timer_callback)(CURLM *multi, /* multi handle */ - long timeout_ms, /* see above */ - void *userp); /* private callback - pointer */ - -CURL_EXTERN CURLMcode curl_multi_socket(CURLM *multi_handle, curl_socket_t s, - int *running_handles); - -CURL_EXTERN CURLMcode curl_multi_socket_action(CURLM *multi_handle, - curl_socket_t s, - int ev_bitmask, - int *running_handles); - -CURL_EXTERN CURLMcode curl_multi_socket_all(CURLM *multi_handle, - int *running_handles); - -#ifndef CURL_ALLOW_OLD_MULTI_SOCKET -/* This macro below was added in 7.16.3 to push users who recompile to use - the new curl_multi_socket_action() instead of the old curl_multi_socket() -*/ -#define curl_multi_socket(x,y,z) curl_multi_socket_action(x,y,0,z) -#endif - -/* - * Name: curl_multi_timeout() - * - * Desc: Returns the maximum number of milliseconds the app is allowed to - * wait before curl_multi_socket() or curl_multi_perform() must be - * called (to allow libcurl's timed events to take place). - * - * Returns: CURLM error code. - */ -CURL_EXTERN CURLMcode curl_multi_timeout(CURLM *multi_handle, - long *milliseconds); - -#undef CINIT /* re-using the same name as in curl.h */ - -#ifdef CURL_ISOCPP -#define CINIT(name,type,num) CURLMOPT_ ## name = CURLOPTTYPE_ ## type + num -#else -/* The macro "##" is ISO C, we assume pre-ISO C doesn't support it. 
*/ -#define LONG CURLOPTTYPE_LONG -#define OBJECTPOINT CURLOPTTYPE_OBJECTPOINT -#define FUNCTIONPOINT CURLOPTTYPE_FUNCTIONPOINT -#define OFF_T CURLOPTTYPE_OFF_T -#define CINIT(name,type,number) CURLMOPT_/**/name = type + number -#endif - -typedef enum { - /* This is the socket callback function pointer */ - CINIT(SOCKETFUNCTION, FUNCTIONPOINT, 1), - - /* This is the argument passed to the socket callback */ - CINIT(SOCKETDATA, OBJECTPOINT, 2), - - /* set to 1 to enable pipelining for this multi handle */ - CINIT(PIPELINING, LONG, 3), - - /* This is the timer callback function pointer */ - CINIT(TIMERFUNCTION, FUNCTIONPOINT, 4), - - /* This is the argument passed to the timer callback */ - CINIT(TIMERDATA, OBJECTPOINT, 5), - - /* maximum number of entries in the connection cache */ - CINIT(MAXCONNECTS, LONG, 6), - - CURLMOPT_LASTENTRY /* the last unused */ -} CURLMoption; - - -/* - * Name: curl_multi_setopt() - * - * Desc: Sets options for the multi handle. - * - * Returns: CURLM error code. - */ -CURL_EXTERN CURLMcode curl_multi_setopt(CURLM *multi_handle, - CURLMoption option, ...); - - -/* - * Name: curl_multi_assign() - * - * Desc: This function sets an association in the multi handle between the - * given socket and a private pointer of the application. This is - * (only) useful for curl_multi_socket uses. - * - * Returns: CURLM error code. - */ -CURL_EXTERN CURLMcode curl_multi_assign(CURLM *multi_handle, - curl_socket_t sockfd, void *sockp); - -#ifdef __cplusplus -} /* end of extern "C" */ -#endif - -#endif diff --git a/common/curl/stdcheaders.h b/common/curl/stdcheaders.h deleted file mode 100644 index ad82ef63..00000000 --- a/common/curl/stdcheaders.h +++ /dev/null @@ -1,33 +0,0 @@ -#ifndef __STDC_HEADERS_H -#define __STDC_HEADERS_H -/*************************************************************************** - * _ _ ____ _ - * Project ___| | | | _ \| | - * / __| | | | |_) | | - * | (__| |_| | _ <| |___ - * \___|\___/|_| \_\_____| - * - * Copyright (C) 1998 - 2010, Daniel Stenberg, , et al. - * - * This software is licensed as described in the file COPYING, which - * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. - * - * You may opt to use, copy, modify, merge, publish, distribute and/or sell - * copies of the Software, and permit persons to whom the Software is - * furnished to do so, under the terms of the COPYING file. - * - * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY - * KIND, either express or implied. - * - ***************************************************************************/ - -#include - -size_t fread (void *, size_t, size_t, FILE *); -size_t fwrite (const void *, size_t, size_t, FILE *); - -int strcasecmp(const char *, const char *); -int strncasecmp(const char *, const char *, size_t); - -#endif /* __STDC_HEADERS_H */ diff --git a/common/curl/typecheck-gcc.h b/common/curl/typecheck-gcc.h deleted file mode 100644 index e6f74a95..00000000 --- a/common/curl/typecheck-gcc.h +++ /dev/null @@ -1,584 +0,0 @@ -#ifndef __CURL_TYPECHECK_GCC_H -#define __CURL_TYPECHECK_GCC_H -/*************************************************************************** - * _ _ ____ _ - * Project ___| | | | _ \| | - * / __| | | | |_) | | - * | (__| |_| | _ <| |___ - * \___|\___/|_| \_\_____| - * - * Copyright (C) 1998 - 2010, Daniel Stenberg, , et al. 
- * - * This software is licensed as described in the file COPYING, which - * you should have received as part of this distribution. The terms - * are also available at http://curl.haxx.se/docs/copyright.html. - * - * You may opt to use, copy, modify, merge, publish, distribute and/or sell - * copies of the Software, and permit persons to whom the Software is - * furnished to do so, under the terms of the COPYING file. - * - * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY - * KIND, either express or implied. - * - ***************************************************************************/ - -/* wraps curl_easy_setopt() with typechecking */ - -/* To add a new kind of warning, add an - * if(_curl_is_sometype_option(_curl_opt)) - * if(!_curl_is_sometype(value)) - * _curl_easy_setopt_err_sometype(); - * block and define _curl_is_sometype_option, _curl_is_sometype and - * _curl_easy_setopt_err_sometype below - * - * NOTE: We use two nested 'if' statements here instead of the && operator, in - * order to work around gcc bug #32061. It affects only gcc 4.3.x/4.4.x - * when compiling with -Wlogical-op. - * - * To add an option that uses the same type as an existing option, you'll just - * need to extend the appropriate _curl_*_option macro - */ -#define curl_easy_setopt(handle, option, value) \ -__extension__ ({ \ - __typeof__ (option) _curl_opt = option; \ - if (__builtin_constant_p(_curl_opt)) { \ - if (_curl_is_long_option(_curl_opt)) \ - if (!_curl_is_long(value)) \ - _curl_easy_setopt_err_long(); \ - if (_curl_is_off_t_option(_curl_opt)) \ - if (!_curl_is_off_t(value)) \ - _curl_easy_setopt_err_curl_off_t(); \ - if (_curl_is_string_option(_curl_opt)) \ - if (!_curl_is_string(value)) \ - _curl_easy_setopt_err_string(); \ - if (_curl_is_write_cb_option(_curl_opt)) \ - if (!_curl_is_write_cb(value)) \ - _curl_easy_setopt_err_write_callback(); \ - if ((_curl_opt) == CURLOPT_READFUNCTION) \ - if (!_curl_is_read_cb(value)) \ - _curl_easy_setopt_err_read_cb(); \ - if ((_curl_opt) == CURLOPT_IOCTLFUNCTION) \ - if (!_curl_is_ioctl_cb(value)) \ - _curl_easy_setopt_err_ioctl_cb(); \ - if ((_curl_opt) == CURLOPT_SOCKOPTFUNCTION) \ - if (!_curl_is_sockopt_cb(value)) \ - _curl_easy_setopt_err_sockopt_cb(); \ - if ((_curl_opt) == CURLOPT_OPENSOCKETFUNCTION) \ - if (!_curl_is_opensocket_cb(value)) \ - _curl_easy_setopt_err_opensocket_cb(); \ - if ((_curl_opt) == CURLOPT_PROGRESSFUNCTION) \ - if (!_curl_is_progress_cb(value)) \ - _curl_easy_setopt_err_progress_cb(); \ - if ((_curl_opt) == CURLOPT_DEBUGFUNCTION) \ - if (!_curl_is_debug_cb(value)) \ - _curl_easy_setopt_err_debug_cb(); \ - if ((_curl_opt) == CURLOPT_SSL_CTX_FUNCTION) \ - if (!_curl_is_ssl_ctx_cb(value)) \ - _curl_easy_setopt_err_ssl_ctx_cb(); \ - if (_curl_is_conv_cb_option(_curl_opt)) \ - if (!_curl_is_conv_cb(value)) \ - _curl_easy_setopt_err_conv_cb(); \ - if ((_curl_opt) == CURLOPT_SEEKFUNCTION) \ - if (!_curl_is_seek_cb(value)) \ - _curl_easy_setopt_err_seek_cb(); \ - if (_curl_is_cb_data_option(_curl_opt)) \ - if (!_curl_is_cb_data(value)) \ - _curl_easy_setopt_err_cb_data(); \ - if ((_curl_opt) == CURLOPT_ERRORBUFFER) \ - if (!_curl_is_error_buffer(value)) \ - _curl_easy_setopt_err_error_buffer(); \ - if ((_curl_opt) == CURLOPT_STDERR) \ - if (!_curl_is_FILE(value)) \ - _curl_easy_setopt_err_FILE(); \ - if (_curl_is_postfields_option(_curl_opt)) \ - if (!_curl_is_postfields(value)) \ - _curl_easy_setopt_err_postfields(); \ - if ((_curl_opt) == CURLOPT_HTTPPOST) \ - if (!_curl_is_arr((value), struct 
curl_httppost)) \ - _curl_easy_setopt_err_curl_httpost(); \ - if (_curl_is_slist_option(_curl_opt)) \ - if (!_curl_is_arr((value), struct curl_slist)) \ - _curl_easy_setopt_err_curl_slist(); \ - if ((_curl_opt) == CURLOPT_SHARE) \ - if (!_curl_is_ptr((value), CURLSH)) \ - _curl_easy_setopt_err_CURLSH(); \ - } \ - curl_easy_setopt(handle, _curl_opt, value); \ -}) - -/* wraps curl_easy_getinfo() with typechecking */ -/* FIXME: don't allow const pointers */ -#define curl_easy_getinfo(handle, info, arg) \ -__extension__ ({ \ - __typeof__ (info) _curl_info = info; \ - if (__builtin_constant_p(_curl_info)) { \ - if (_curl_is_string_info(_curl_info)) \ - if (!_curl_is_arr((arg), char *)) \ - _curl_easy_getinfo_err_string(); \ - if (_curl_is_long_info(_curl_info)) \ - if (!_curl_is_arr((arg), long)) \ - _curl_easy_getinfo_err_long(); \ - if (_curl_is_double_info(_curl_info)) \ - if (!_curl_is_arr((arg), double)) \ - _curl_easy_getinfo_err_double(); \ - if (_curl_is_slist_info(_curl_info)) \ - if (!_curl_is_arr((arg), struct curl_slist *)) \ - _curl_easy_getinfo_err_curl_slist(); \ - } \ - curl_easy_getinfo(handle, _curl_info, arg); \ -}) - -/* TODO: typechecking for curl_share_setopt() and curl_multi_setopt(), - * for now just make sure that the functions are called with three - * arguments - */ -#define curl_share_setopt(share,opt,param) curl_share_setopt(share,opt,param) -#define curl_multi_setopt(handle,opt,param) curl_multi_setopt(handle,opt,param) - - -/* the actual warnings, triggered by calling the _curl_easy_setopt_err* - * functions */ - -/* To define a new warning, use _CURL_WARNING(identifier, "message") */ -#define _CURL_WARNING(id, message) \ - static void __attribute__((warning(message))) __attribute__((unused)) \ - __attribute__((noinline)) id(void) { __asm__(""); } - -_CURL_WARNING(_curl_easy_setopt_err_long, - "curl_easy_setopt expects a long argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_curl_off_t, - "curl_easy_setopt expects a curl_off_t argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_string, - "curl_easy_setopt expects a string (char* or char[]) argument for this option" - ) -_CURL_WARNING(_curl_easy_setopt_err_write_callback, - "curl_easy_setopt expects a curl_write_callback argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_read_cb, - "curl_easy_setopt expects a curl_read_callback argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_ioctl_cb, - "curl_easy_setopt expects a curl_ioctl_callback argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_sockopt_cb, - "curl_easy_setopt expects a curl_sockopt_callback argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_opensocket_cb, - "curl_easy_setopt expects a curl_opensocket_callback argument for this option" - ) -_CURL_WARNING(_curl_easy_setopt_err_progress_cb, - "curl_easy_setopt expects a curl_progress_callback argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_debug_cb, - "curl_easy_setopt expects a curl_debug_callback argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_ssl_ctx_cb, - "curl_easy_setopt expects a curl_ssl_ctx_callback argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_conv_cb, - "curl_easy_setopt expects a curl_conv_callback argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_seek_cb, - "curl_easy_setopt expects a curl_seek_callback argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_cb_data, - "curl_easy_setopt expects a private data pointer as argument for this option") 
-_CURL_WARNING(_curl_easy_setopt_err_error_buffer, - "curl_easy_setopt expects a char buffer of CURL_ERROR_SIZE as argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_FILE, - "curl_easy_setopt expects a FILE* argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_postfields, - "curl_easy_setopt expects a void* or char* argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_curl_httpost, - "curl_easy_setopt expects a struct curl_httppost* argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_curl_slist, - "curl_easy_setopt expects a struct curl_slist* argument for this option") -_CURL_WARNING(_curl_easy_setopt_err_CURLSH, - "curl_easy_setopt expects a CURLSH* argument for this option") - -_CURL_WARNING(_curl_easy_getinfo_err_string, - "curl_easy_getinfo expects a pointer to char * for this info") -_CURL_WARNING(_curl_easy_getinfo_err_long, - "curl_easy_getinfo expects a pointer to long for this info") -_CURL_WARNING(_curl_easy_getinfo_err_double, - "curl_easy_getinfo expects a pointer to double for this info") -_CURL_WARNING(_curl_easy_getinfo_err_curl_slist, - "curl_easy_getinfo expects a pointer to struct curl_slist * for this info") - -/* groups of curl_easy_setops options that take the same type of argument */ - -/* To add a new option to one of the groups, just add - * (option) == CURLOPT_SOMETHING - * to the or-expression. If the option takes a long or curl_off_t, you don't - * have to do anything - */ - -/* evaluates to true if option takes a long argument */ -#define _curl_is_long_option(option) \ - (0 < (option) && (option) < CURLOPTTYPE_OBJECTPOINT) - -#define _curl_is_off_t_option(option) \ - ((option) > CURLOPTTYPE_OFF_T) - -/* evaluates to true if option takes a char* argument */ -#define _curl_is_string_option(option) \ - ((option) == CURLOPT_URL || \ - (option) == CURLOPT_PROXY || \ - (option) == CURLOPT_INTERFACE || \ - (option) == CURLOPT_NETRC_FILE || \ - (option) == CURLOPT_USERPWD || \ - (option) == CURLOPT_USERNAME || \ - (option) == CURLOPT_PASSWORD || \ - (option) == CURLOPT_PROXYUSERPWD || \ - (option) == CURLOPT_PROXYUSERNAME || \ - (option) == CURLOPT_PROXYPASSWORD || \ - (option) == CURLOPT_NOPROXY || \ - (option) == CURLOPT_ENCODING || \ - (option) == CURLOPT_REFERER || \ - (option) == CURLOPT_USERAGENT || \ - (option) == CURLOPT_COOKIE || \ - (option) == CURLOPT_COOKIEFILE || \ - (option) == CURLOPT_COOKIEJAR || \ - (option) == CURLOPT_COOKIELIST || \ - (option) == CURLOPT_FTPPORT || \ - (option) == CURLOPT_FTP_ALTERNATIVE_TO_USER || \ - (option) == CURLOPT_FTP_ACCOUNT || \ - (option) == CURLOPT_RANGE || \ - (option) == CURLOPT_CUSTOMREQUEST || \ - (option) == CURLOPT_SSLCERT || \ - (option) == CURLOPT_SSLCERTTYPE || \ - (option) == CURLOPT_SSLKEY || \ - (option) == CURLOPT_SSLKEYTYPE || \ - (option) == CURLOPT_KEYPASSWD || \ - (option) == CURLOPT_SSLENGINE || \ - (option) == CURLOPT_CAINFO || \ - (option) == CURLOPT_CAPATH || \ - (option) == CURLOPT_RANDOM_FILE || \ - (option) == CURLOPT_EGDSOCKET || \ - (option) == CURLOPT_SSL_CIPHER_LIST || \ - (option) == CURLOPT_KRBLEVEL || \ - (option) == CURLOPT_SSH_HOST_PUBLIC_KEY_MD5 || \ - (option) == CURLOPT_SSH_PUBLIC_KEYFILE || \ - (option) == CURLOPT_SSH_PRIVATE_KEYFILE || \ - (option) == CURLOPT_CRLFILE || \ - (option) == CURLOPT_ISSUERCERT || \ - (option) == CURLOPT_SOCKS5_GSSAPI_SERVICE || \ - (option) == CURLOPT_SSH_KNOWNHOSTS || \ - (option) == CURLOPT_MAIL_FROM || \ - (option) == CURLOPT_RTSP_SESSION_ID || \ - (option) == CURLOPT_RTSP_STREAM_URI || \ - (option) == 
CURLOPT_RTSP_TRANSPORT || \ - 0) - -/* evaluates to true if option takes a curl_write_callback argument */ -#define _curl_is_write_cb_option(option) \ - ((option) == CURLOPT_HEADERFUNCTION || \ - (option) == CURLOPT_WRITEFUNCTION) - -/* evaluates to true if option takes a curl_conv_callback argument */ -#define _curl_is_conv_cb_option(option) \ - ((option) == CURLOPT_CONV_TO_NETWORK_FUNCTION || \ - (option) == CURLOPT_CONV_FROM_NETWORK_FUNCTION || \ - (option) == CURLOPT_CONV_FROM_UTF8_FUNCTION) - -/* evaluates to true if option takes a data argument to pass to a callback */ -#define _curl_is_cb_data_option(option) \ - ((option) == CURLOPT_WRITEDATA || \ - (option) == CURLOPT_READDATA || \ - (option) == CURLOPT_IOCTLDATA || \ - (option) == CURLOPT_SOCKOPTDATA || \ - (option) == CURLOPT_OPENSOCKETDATA || \ - (option) == CURLOPT_PROGRESSDATA || \ - (option) == CURLOPT_WRITEHEADER || \ - (option) == CURLOPT_DEBUGDATA || \ - (option) == CURLOPT_SSL_CTX_DATA || \ - (option) == CURLOPT_SEEKDATA || \ - (option) == CURLOPT_PRIVATE || \ - (option) == CURLOPT_SSH_KEYDATA || \ - (option) == CURLOPT_INTERLEAVEDATA || \ - (option) == CURLOPT_CHUNK_DATA || \ - (option) == CURLOPT_FNMATCH_DATA || \ - 0) - -/* evaluates to true if option takes a POST data argument (void* or char*) */ -#define _curl_is_postfields_option(option) \ - ((option) == CURLOPT_POSTFIELDS || \ - (option) == CURLOPT_COPYPOSTFIELDS || \ - 0) - -/* evaluates to true if option takes a struct curl_slist * argument */ -#define _curl_is_slist_option(option) \ - ((option) == CURLOPT_HTTPHEADER || \ - (option) == CURLOPT_HTTP200ALIASES || \ - (option) == CURLOPT_QUOTE || \ - (option) == CURLOPT_POSTQUOTE || \ - (option) == CURLOPT_PREQUOTE || \ - (option) == CURLOPT_TELNETOPTIONS || \ - (option) == CURLOPT_MAIL_RCPT || \ - 0) - -/* groups of curl_easy_getinfo infos that take the same type of argument */ - -/* evaluates to true if info expects a pointer to char * argument */ -#define _curl_is_string_info(info) \ - (CURLINFO_STRING < (info) && (info) < CURLINFO_LONG) - -/* evaluates to true if info expects a pointer to long argument */ -#define _curl_is_long_info(info) \ - (CURLINFO_LONG < (info) && (info) < CURLINFO_DOUBLE) - -/* evaluates to true if info expects a pointer to double argument */ -#define _curl_is_double_info(info) \ - (CURLINFO_DOUBLE < (info) && (info) < CURLINFO_SLIST) - -/* true if info expects a pointer to struct curl_slist * argument */ -#define _curl_is_slist_info(info) \ - (CURLINFO_SLIST < (info)) - - -/* typecheck helpers -- check whether given expression has requested type*/ - -/* For pointers, you can use the _curl_is_ptr/_curl_is_arr macros, - * otherwise define a new macro. Search for __builtin_types_compatible_p - * in the GCC manual. - * NOTE: these macros MUST NOT EVALUATE their arguments! The argument is - * the actual expression passed to the curl_easy_setopt macro. This - * means that you can only apply the sizeof and __typeof__ operators, no - * == or whatsoever. 
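What these non-evaluating checks buy in practice: with typecheck-gcc.h active, a mistyped curl_easy_setopt() argument trips one of the _CURL_WARNING stubs at compile time rather than misbehaving at run time. A sketch (the option constants are real; the handle and values are illustrative):

    static void typecheck_sketch(void)
    {
        CURL *h = curl_easy_init();
        curl_easy_setopt(h, CURLOPT_TIMEOUT, 30L);          /* OK: long */
        curl_easy_setopt(h, CURLOPT_URL, "https://a.test"); /* OK: char* */
        /* With the macro above in effect, gcc would warn on this line:
           "curl_easy_setopt expects a long argument for this option" */
        curl_easy_setopt(h, CURLOPT_TIMEOUT, "30");
        curl_easy_cleanup(h);
    }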
- */ - -/* XXX: should evaluate to true iff expr is a pointer */ -#define _curl_is_any_ptr(expr) \ - (sizeof(expr) == sizeof(void*)) - -/* evaluates to true if expr is NULL */ -/* XXX: must not evaluate expr, so this check is not accurate */ -#define _curl_is_NULL(expr) \ - (__builtin_types_compatible_p(__typeof__(expr), __typeof__(NULL))) - -/* evaluates to true if expr is type*, const type* or NULL */ -#define _curl_is_ptr(expr, type) \ - (_curl_is_NULL(expr) || \ - __builtin_types_compatible_p(__typeof__(expr), type *) || \ - __builtin_types_compatible_p(__typeof__(expr), const type *)) - -/* evaluates to true if expr is one of type[], type*, NULL or const type* */ -#define _curl_is_arr(expr, type) \ - (_curl_is_ptr((expr), type) || \ - __builtin_types_compatible_p(__typeof__(expr), type [])) - -/* evaluates to true if expr is a string */ -#define _curl_is_string(expr) \ - (_curl_is_arr((expr), char) || \ - _curl_is_arr((expr), signed char) || \ - _curl_is_arr((expr), unsigned char)) - -/* evaluates to true if expr is a long (no matter the signedness) - * XXX: for now, int is also accepted (and therefore short and char, which - * are promoted to int when passed to a variadic function) */ -#define _curl_is_long(expr) \ - (__builtin_types_compatible_p(__typeof__(expr), long) || \ - __builtin_types_compatible_p(__typeof__(expr), signed long) || \ - __builtin_types_compatible_p(__typeof__(expr), unsigned long) || \ - __builtin_types_compatible_p(__typeof__(expr), int) || \ - __builtin_types_compatible_p(__typeof__(expr), signed int) || \ - __builtin_types_compatible_p(__typeof__(expr), unsigned int) || \ - __builtin_types_compatible_p(__typeof__(expr), short) || \ - __builtin_types_compatible_p(__typeof__(expr), signed short) || \ - __builtin_types_compatible_p(__typeof__(expr), unsigned short) || \ - __builtin_types_compatible_p(__typeof__(expr), char) || \ - __builtin_types_compatible_p(__typeof__(expr), signed char) || \ - __builtin_types_compatible_p(__typeof__(expr), unsigned char)) - -/* evaluates to true if expr is of type curl_off_t */ -#define _curl_is_off_t(expr) \ - (__builtin_types_compatible_p(__typeof__(expr), curl_off_t)) - -/* evaluates to true if expr is abuffer suitable for CURLOPT_ERRORBUFFER */ -/* XXX: also check size of an char[] array? */ -#define _curl_is_error_buffer(expr) \ - (__builtin_types_compatible_p(__typeof__(expr), char *) || \ - __builtin_types_compatible_p(__typeof__(expr), char[])) - -/* evaluates to true if expr is of type (const) void* or (const) FILE* */ -#if 0 -#define _curl_is_cb_data(expr) \ - (_curl_is_ptr((expr), void) || \ - _curl_is_ptr((expr), FILE)) -#else /* be less strict */ -#define _curl_is_cb_data(expr) \ - _curl_is_any_ptr(expr) -#endif - -/* evaluates to true if expr is of type FILE* */ -#define _curl_is_FILE(expr) \ - (__builtin_types_compatible_p(__typeof__(expr), FILE *)) - -/* evaluates to true if expr can be passed as POST data (void* or char*) */ -#define _curl_is_postfields(expr) \ - (_curl_is_ptr((expr), void) || \ - _curl_is_arr((expr), char)) - -/* FIXME: the whole callback checking is messy... - * The idea is to tolerate char vs. void and const vs. 
not const - * pointers in arguments at least - */ -/* helper: __builtin_types_compatible_p distinguishes between functions and - * function pointers, hide it */ -#define _curl_callback_compatible(func, type) \ - (__builtin_types_compatible_p(__typeof__(func), type) || \ - __builtin_types_compatible_p(__typeof__(func), type*)) - -/* evaluates to true if expr is of type curl_read_callback or "similar" */ -#define _curl_is_read_cb(expr) \ - (_curl_is_NULL(expr) || \ - __builtin_types_compatible_p(__typeof__(expr), __typeof__(fread)) || \ - __builtin_types_compatible_p(__typeof__(expr), curl_read_callback) || \ - _curl_callback_compatible((expr), _curl_read_callback1) || \ - _curl_callback_compatible((expr), _curl_read_callback2) || \ - _curl_callback_compatible((expr), _curl_read_callback3) || \ - _curl_callback_compatible((expr), _curl_read_callback4) || \ - _curl_callback_compatible((expr), _curl_read_callback5) || \ - _curl_callback_compatible((expr), _curl_read_callback6)) -typedef size_t (_curl_read_callback1)(char *, size_t, size_t, void*); -typedef size_t (_curl_read_callback2)(char *, size_t, size_t, const void*); -typedef size_t (_curl_read_callback3)(char *, size_t, size_t, FILE*); -typedef size_t (_curl_read_callback4)(void *, size_t, size_t, void*); -typedef size_t (_curl_read_callback5)(void *, size_t, size_t, const void*); -typedef size_t (_curl_read_callback6)(void *, size_t, size_t, FILE*); - -/* evaluates to true if expr is of type curl_write_callback or "similar" */ -#define _curl_is_write_cb(expr) \ - (_curl_is_read_cb(expr) || \ - __builtin_types_compatible_p(__typeof__(expr), __typeof__(fwrite)) || \ - __builtin_types_compatible_p(__typeof__(expr), curl_write_callback) || \ - _curl_callback_compatible((expr), _curl_write_callback1) || \ - _curl_callback_compatible((expr), _curl_write_callback2) || \ - _curl_callback_compatible((expr), _curl_write_callback3) || \ - _curl_callback_compatible((expr), _curl_write_callback4) || \ - _curl_callback_compatible((expr), _curl_write_callback5) || \ - _curl_callback_compatible((expr), _curl_write_callback6)) -typedef size_t (_curl_write_callback1)(const char *, size_t, size_t, void*); -typedef size_t (_curl_write_callback2)(const char *, size_t, size_t, - const void*); -typedef size_t (_curl_write_callback3)(const char *, size_t, size_t, FILE*); -typedef size_t (_curl_write_callback4)(const void *, size_t, size_t, void*); -typedef size_t (_curl_write_callback5)(const void *, size_t, size_t, - const void*); -typedef size_t (_curl_write_callback6)(const void *, size_t, size_t, FILE*); - -/* evaluates to true if expr is of type curl_ioctl_callback or "similar" */ -#define _curl_is_ioctl_cb(expr) \ - (_curl_is_NULL(expr) || \ - __builtin_types_compatible_p(__typeof__(expr), curl_ioctl_callback) || \ - _curl_callback_compatible((expr), _curl_ioctl_callback1) || \ - _curl_callback_compatible((expr), _curl_ioctl_callback2) || \ - _curl_callback_compatible((expr), _curl_ioctl_callback3) || \ - _curl_callback_compatible((expr), _curl_ioctl_callback4)) -typedef curlioerr (_curl_ioctl_callback1)(CURL *, int, void*); -typedef curlioerr (_curl_ioctl_callback2)(CURL *, int, const void*); -typedef curlioerr (_curl_ioctl_callback3)(CURL *, curliocmd, void*); -typedef curlioerr (_curl_ioctl_callback4)(CURL *, curliocmd, const void*); - -/* evaluates to true if expr is of type curl_sockopt_callback or "similar" */ -#define _curl_is_sockopt_cb(expr) \ - (_curl_is_NULL(expr) || \ - __builtin_types_compatible_p(__typeof__(expr), curl_sockopt_callback) || 
\ - _curl_callback_compatible((expr), _curl_sockopt_callback1) || \ - _curl_callback_compatible((expr), _curl_sockopt_callback2)) -typedef int (_curl_sockopt_callback1)(void *, curl_socket_t, curlsocktype); -typedef int (_curl_sockopt_callback2)(const void *, curl_socket_t, - curlsocktype); - -/* evaluates to true if expr is of type curl_opensocket_callback or "similar" */ -#define _curl_is_opensocket_cb(expr) \ - (_curl_is_NULL(expr) || \ - __builtin_types_compatible_p(__typeof__(expr), curl_opensocket_callback) ||\ - _curl_callback_compatible((expr), _curl_opensocket_callback1) || \ - _curl_callback_compatible((expr), _curl_opensocket_callback2) || \ - _curl_callback_compatible((expr), _curl_opensocket_callback3) || \ - _curl_callback_compatible((expr), _curl_opensocket_callback4)) -typedef curl_socket_t (_curl_opensocket_callback1) - (void *, curlsocktype, struct curl_sockaddr *); -typedef curl_socket_t (_curl_opensocket_callback2) - (void *, curlsocktype, const struct curl_sockaddr *); -typedef curl_socket_t (_curl_opensocket_callback3) - (const void *, curlsocktype, struct curl_sockaddr *); -typedef curl_socket_t (_curl_opensocket_callback4) - (const void *, curlsocktype, const struct curl_sockaddr *); - -/* evaluates to true if expr is of type curl_progress_callback or "similar" */ -#define _curl_is_progress_cb(expr) \ - (_curl_is_NULL(expr) || \ - __builtin_types_compatible_p(__typeof__(expr), curl_progress_callback) || \ - _curl_callback_compatible((expr), _curl_progress_callback1) || \ - _curl_callback_compatible((expr), _curl_progress_callback2)) -typedef int (_curl_progress_callback1)(void *, - double, double, double, double); -typedef int (_curl_progress_callback2)(const void *, - double, double, double, double); - -/* evaluates to true if expr is of type curl_debug_callback or "similar" */ -#define _curl_is_debug_cb(expr) \ - (_curl_is_NULL(expr) || \ - __builtin_types_compatible_p(__typeof__(expr), curl_debug_callback) || \ - _curl_callback_compatible((expr), _curl_debug_callback1) || \ - _curl_callback_compatible((expr), _curl_debug_callback2) || \ - _curl_callback_compatible((expr), _curl_debug_callback3) || \ - _curl_callback_compatible((expr), _curl_debug_callback4)) -typedef int (_curl_debug_callback1) (CURL *, - curl_infotype, char *, size_t, void *); -typedef int (_curl_debug_callback2) (CURL *, - curl_infotype, char *, size_t, const void *); -typedef int (_curl_debug_callback3) (CURL *, - curl_infotype, const char *, size_t, void *); -typedef int (_curl_debug_callback4) (CURL *, - curl_infotype, const char *, size_t, const void *); - -/* evaluates to true if expr is of type curl_ssl_ctx_callback or "similar" */ -/* this is getting even messier... 
*/ -#define _curl_is_ssl_ctx_cb(expr) \ - (_curl_is_NULL(expr) || \ - __builtin_types_compatible_p(__typeof__(expr), curl_ssl_ctx_callback) || \ - _curl_callback_compatible((expr), _curl_ssl_ctx_callback1) || \ - _curl_callback_compatible((expr), _curl_ssl_ctx_callback2) || \ - _curl_callback_compatible((expr), _curl_ssl_ctx_callback3) || \ - _curl_callback_compatible((expr), _curl_ssl_ctx_callback4) || \ - _curl_callback_compatible((expr), _curl_ssl_ctx_callback5) || \ - _curl_callback_compatible((expr), _curl_ssl_ctx_callback6) || \ - _curl_callback_compatible((expr), _curl_ssl_ctx_callback7) || \ - _curl_callback_compatible((expr), _curl_ssl_ctx_callback8)) -typedef CURLcode (_curl_ssl_ctx_callback1)(CURL *, void *, void *); -typedef CURLcode (_curl_ssl_ctx_callback2)(CURL *, void *, const void *); -typedef CURLcode (_curl_ssl_ctx_callback3)(CURL *, const void *, void *); -typedef CURLcode (_curl_ssl_ctx_callback4)(CURL *, const void *, const void *); -#ifdef HEADER_SSL_H -/* hack: if we included OpenSSL's ssl.h, we know about SSL_CTX - * this will of course break if we're included before OpenSSL headers... - */ -typedef CURLcode (_curl_ssl_ctx_callback5)(CURL *, SSL_CTX, void *); -typedef CURLcode (_curl_ssl_ctx_callback6)(CURL *, SSL_CTX, const void *); -typedef CURLcode (_curl_ssl_ctx_callback7)(CURL *, const SSL_CTX, void *); -typedef CURLcode (_curl_ssl_ctx_callback8)(CURL *, const SSL_CTX, const void *); -#else -typedef _curl_ssl_ctx_callback1 _curl_ssl_ctx_callback5; -typedef _curl_ssl_ctx_callback1 _curl_ssl_ctx_callback6; -typedef _curl_ssl_ctx_callback1 _curl_ssl_ctx_callback7; -typedef _curl_ssl_ctx_callback1 _curl_ssl_ctx_callback8; -#endif - -/* evaluates to true if expr is of type curl_conv_callback or "similar" */ -#define _curl_is_conv_cb(expr) \ - (_curl_is_NULL(expr) || \ - __builtin_types_compatible_p(__typeof__(expr), curl_conv_callback) || \ - _curl_callback_compatible((expr), _curl_conv_callback1) || \ - _curl_callback_compatible((expr), _curl_conv_callback2) || \ - _curl_callback_compatible((expr), _curl_conv_callback3) || \ - _curl_callback_compatible((expr), _curl_conv_callback4)) -typedef CURLcode (*_curl_conv_callback1)(char *, size_t length); -typedef CURLcode (*_curl_conv_callback2)(const char *, size_t length); -typedef CURLcode (*_curl_conv_callback3)(void *, size_t length); -typedef CURLcode (*_curl_conv_callback4)(const void *, size_t length); - -/* evaluates to true if expr is of type curl_seek_callback or "similar" */ -#define _curl_is_seek_cb(expr) \ - (_curl_is_NULL(expr) || \ - __builtin_types_compatible_p(__typeof__(expr), curl_seek_callback) || \ - _curl_callback_compatible((expr), _curl_seek_callback1) || \ - _curl_callback_compatible((expr), _curl_seek_callback2)) -typedef CURLcode (*_curl_seek_callback1)(void *, curl_off_t, int); -typedef CURLcode (*_curl_seek_callback2)(const void *, curl_off_t, int); - - -#endif /* __CURL_TYPECHECK_GCC_H */ diff --git a/common/curl/types.h b/common/curl/types.h deleted file mode 100644 index d37d6ae9..00000000 --- a/common/curl/types.h +++ /dev/null @@ -1 +0,0 @@ -/* not used */ diff --git a/common/sse2neon.h b/common/sse2neon.h new file mode 100644 index 00000000..9e512acf --- /dev/null +++ b/common/sse2neon.h @@ -0,0 +1,7598 @@ +#ifndef SSE2NEON_H +#define SSE2NEON_H + +// This header file provides a simple API translation layer +// between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions +// +// This header file does not yet translate all of the SSE intrinsics. 
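The intended drop-in use of this header, as a sketch: keep the native intrinsics headers on x86 and substitute sse2neon on Arm, with no changes to the SSE-using code itself (the guard macros shown are one common choice, not the only one):

    #if defined(__i386__) || defined(__x86_64__)
    #include <xmmintrin.h>   /* native SSE */
    #include <emmintrin.h>   /* native SSE2 */
    #else
    #include "sse2neon.h"    /* same intrinsics, implemented with NEON */
    #endif

    #include <stdio.h>

    int main(void)
    {
        __m128 a = _mm_set_ps(4.f, 3.f, 2.f, 1.f);  /* lowest lane = 1.f */
        __m128 s = _mm_add_ps(a, a);
        printf("%f\n", _mm_cvtss_f32(s));           /* prints 2.000000 */
        return 0;
    }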
+// +// Contributors to this work are: +// John W. Ratcliff +// Brandon Rowlett +// Ken Fast +// Eric van Beurden +// Alexander Potylitsin +// Hasindu Gamaarachchi +// Jim Huang +// Mark Cheng +// Malcolm James MacLeod +// Devin Hussey (easyaspi314) +// Sebastian Pop +// Developer Ecosystem Engineering +// Danila Kutenin +// François Turban (JishinMaster) +// Pei-Hsuan Hung +// Yang-Hao Yuan +// Syoyo Fujita +// Brecht Van Lommel + +/* + * sse2neon is freely redistributable under the MIT License. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* Tunable configurations */ + +/* Enable precise implementation of math operations + * This would slow down the computation a bit, but gives consistent result with + * x86 SSE2. (e.g. would solve a hole or NaN pixel in the rendering result) + */ +/* _mm_min_ps and _mm_max_ps */ +#ifndef SSE2NEON_PRECISE_MINMAX +#define SSE2NEON_PRECISE_MINMAX (0) +#endif +/* _mm_rcp_ps and _mm_div_ps */ +#ifndef SSE2NEON_PRECISE_DIV +#define SSE2NEON_PRECISE_DIV (0) +#endif +/* _mm_sqrt_ps and _mm_rsqrt_ps */ +#ifndef SSE2NEON_PRECISE_SQRT +#define SSE2NEON_PRECISE_SQRT (0) +#endif + +#if defined(__GNUC__) || defined(__clang__) +#pragma push_macro("FORCE_INLINE") +#pragma push_macro("ALIGN_STRUCT") +#define FORCE_INLINE static inline __attribute__((always_inline)) +#define ALIGN_STRUCT(x) __attribute__((aligned(x))) +#ifndef likely +#define likely(x) __builtin_expect(!!(x), 1) +#endif +#ifndef unlikely +#define unlikely(x) __builtin_expect(!!(x), 0) +#endif +#else +#error "Macro name collisions may happen with unsupported compiler." +#ifdef FORCE_INLINE +#undef FORCE_INLINE +#endif +#define FORCE_INLINE static inline +#ifndef ALIGN_STRUCT +#define ALIGN_STRUCT(x) __declspec(align(x)) +#endif +#endif +#ifndef likely +#define likely(x) (x) +#endif +#ifndef unlikely +#define unlikely(x) (x) +#endif + +#include +#include + +/* Architecture-specific build options */ +/* FIXME: #pragma GCC push_options is only available on GCC */ +#if defined(__GNUC__) +#if defined(__arm__) && __ARM_ARCH == 7 +/* According to ARM C Language Extensions Architecture specification, + * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON) + * architecture supported. + */ +#if !defined(__ARM_NEON) || !defined(__ARM_NEON__) +#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON." 
+#endif +#if !defined(__clang__) +#pragma GCC push_options +#pragma GCC target("fpu=neon") +#endif +#elif defined(__aarch64__) +#if !defined(__clang__) +#pragma GCC push_options +#pragma GCC target("+simd") +#endif +#else +#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A." +#endif +#endif + +#include + +/* Rounding functions require either Aarch64 instructions or libm failback */ +#if !defined(__aarch64__) +#include +#endif + +/* "__has_builtin" can be used to query support for built-in functions + * provided by gcc/clang and other compilers that support it. + */ +#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */ +/* Compatibility with gcc <= 9 */ +#if __GNUC__ <= 9 +#define __has_builtin(x) HAS##x +#define HAS__builtin_popcount 1 +#define HAS__builtin_popcountll 1 +#else +#define __has_builtin(x) 0 +#endif +#endif + +/** + * MACRO for shuffle parameter for _mm_shuffle_ps(). + * Argument fp3 is a digit[0123] that represents the fp from argument "b" + * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same + * for fp2 in result. fp1 is a digit[0123] that represents the fp from + * argument "a" of mm_shuffle_ps that will be places in fp1 of result. + * fp0 is the same for fp0 of result. + */ +#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \ + (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0))) + +/* Rounding mode macros. */ +#define _MM_FROUND_TO_NEAREST_INT 0x00 +#define _MM_FROUND_TO_NEG_INF 0x01 +#define _MM_FROUND_TO_POS_INF 0x02 +#define _MM_FROUND_TO_ZERO 0x03 +#define _MM_FROUND_CUR_DIRECTION 0x04 +#define _MM_FROUND_NO_EXC 0x08 +#define _MM_ROUND_NEAREST 0x0000 +#define _MM_ROUND_DOWN 0x2000 +#define _MM_ROUND_UP 0x4000 +#define _MM_ROUND_TOWARD_ZERO 0x6000 + +/* indicate immediate constant argument in a given range */ +#define __constrange(a, b) const + +/* A few intrinsics accept traditional data types like ints or floats, but + * most operate on data types that are specific to SSE. + * If a vector type ends in d, it contains doubles, and if it does not have + * a suffix, it contains floats. An integer vector type can contain any type + * of integer, from chars to shorts to unsigned long longs. + */ +typedef int64x1_t __m64; +typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */ +// On ARM 32-bit architecture, the float64x2_t is not supported. +// The data type __m128d should be represented in a different way for related +// intrinsic conversion. 
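A sketch of the _MM_SHUFFLE() encoding above, which also shows that the typedefs just below make __m128 and float32x4_t literally the same type, so SSE-style and native NEON code can interleave freely:

    #include "sse2neon.h"

    static float32x4_t shuffle_sketch(__m128 a, __m128 b)
    {
        /* _MM_SHUFFLE(3,2,1,0) == 0xE4: take lanes 0,1 from a and
           lanes 2,3 from b, i.e. dst = { a[0], a[1], b[2], b[3] }. */
        __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0));
        return r;   /* no cast needed: __m128 is float32x4_t in this header */
    }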
+#if defined(__aarch64__) +typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */ +#else +typedef float32x4_t __m128d; +#endif +typedef int64x2_t __m128i; /* 128-bit vector containing integers */ + +/* type-safe casting between types */ + +#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x) +#define vreinterpretq_m128_f32(x) (x) +#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x) + +#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x) +#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x) +#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x) +#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x) + +#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x) +#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x) +#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x) +#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x) + +#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x) +#define vreinterpretq_f32_m128(x) (x) +#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x) + +#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x) +#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x) +#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x) +#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x) + +#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x) +#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x) +#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x) +#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x) + +#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x) +#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x) +#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x) +#define vreinterpretq_m128i_s64(x) (x) + +#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x) +#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x) +#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x) +#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x) + +#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x) +#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x) + +#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x) +#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x) +#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x) +#define vreinterpretq_s64_m128i(x) (x) + +#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x) +#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x) +#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x) +#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x) + +#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x) +#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x) +#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x) +#define vreinterpret_m64_s64(x) (x) + +#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x) +#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x) +#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x) +#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x) + +#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x) +#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x) +#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x) + +#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x) +#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x) +#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x) +#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x) + +#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x) +#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x) +#define 
vreinterpret_s32_m64(x) vreinterpret_s32_s64(x) +#define vreinterpret_s64_m64(x) (x) + +#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x) + +#if defined(__aarch64__) +#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x) +#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x) + +#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x) + +#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x) +#define vreinterpretq_m128d_f64(x) (x) + +#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x) + +#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x) +#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x) + +#define vreinterpretq_f64_m128d(x) (x) +#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x) +#else +#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x) +#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x) + +#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x) +#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x) + +#define vreinterpretq_m128d_f32(x) (x) + +#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x) + +#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x) +#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x) + +#define vreinterpretq_f32_m128d(x) (x) +#endif + +// A struct is defined in this header file called 'SIMDVec' which can be used +// by applications which attempt to access the contents of an _m128 struct +// directly. It is important to note that accessing the __m128 struct directly +// is bad coding practice by Microsoft: @see: +// https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx +// +// However, some legacy source code may try to access the contents of an __m128 +// struct directly so the developer can use the SIMDVec as an alias for it. Any +// casting must be done manually by the developer, as you cannot cast or +// otherwise alias the base NEON data type for intrinsic operations. +// +// union intended to allow direct access to an __m128 variable using the names +// that the MSVC compiler provides. This union should really only be used when +// trying to access the members of the vector as integer values. GCC/clang +// allow native access to the float members through a simple array access +// operator (in C since 4.6, in C++ since 4.8). +// +// Ideally direct accesses to SIMD vectors should not be used since it can cause +// a performance hit. If it really is needed however, the original __m128 +// variable can be aliased with a pointer to this union and used to access +// individual components. The use of this union should be hidden behind a macro +// that is used throughout the codebase to access the members instead of always +// declaring this type of variable. +typedef union ALIGN_STRUCT(16) SIMDVec { + float m128_f32[4]; // as floats - DON'T USE. Added for convenience. + int8_t m128_i8[16]; // as signed 8-bit integers. + int16_t m128_i16[8]; // as signed 16-bit integers. + int32_t m128_i32[4]; // as signed 32-bit integers. + int64_t m128_i64[2]; // as signed 64-bit integers. + uint8_t m128_u8[16]; // as unsigned 8-bit integers. + uint16_t m128_u16[8]; // as unsigned 16-bit integers. + uint32_t m128_u32[4]; // as unsigned 32-bit integers. + uint64_t m128_u64[2]; // as unsigned 64-bit integers. 
+} SIMDVec; + +// casting using SIMDVec +#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n]) +#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n]) +#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n]) + +/* Backwards compatibility for compilers with lack of specific type support */ + +// Older gcc does not define vld1q_u8_x4 type +#if defined(__GNUC__) && !defined(__clang__) && \ + ((__GNUC__ == 10 && (__GNUC_MINOR__ <= 1)) || \ + (__GNUC__ == 9 && (__GNUC_MINOR__ <= 3)) || \ + (__GNUC__ == 8 && (__GNUC_MINOR__ <= 4)) || __GNUC__ <= 7) +FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) +{ + uint8x16x4_t ret; + ret.val[0] = vld1q_u8(p + 0); + ret.val[1] = vld1q_u8(p + 16); + ret.val[2] = vld1q_u8(p + 32); + ret.val[3] = vld1q_u8(p + 48); + return ret; +} +#else +// Wraps vld1q_u8_x4 +FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) +{ + return vld1q_u8_x4(p); +} +#endif + +/* Function Naming Conventions + * The naming convention of SSE intrinsics is straightforward. A generic SSE + * intrinsic function is given as follows: + * _mm__ + * + * The parts of this format are given as follows: + * 1. describes the operation performed by the intrinsic + * 2. identifies the data type of the function's primary arguments + * + * This last part, , is a little complicated. It identifies the + * content of the input values, and can be set to any of the following values: + * + ps - vectors contain floats (ps stands for packed single-precision) + * + pd - vectors cantain doubles (pd stands for packed double-precision) + * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit + * signed integers + * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit + * unsigned integers + * + si128 - unspecified 128-bit vector or 256-bit vector + * + m128/m128i/m128d - identifies input vector types when they are different + * than the type of the returned vector + * + * For example, _mm_setzero_ps. The _mm implies that the function returns + * a 128-bit vector. The _ps at the end implies that the argument vectors + * contain floats. + * + * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8) + * // Set packed 16-bit integers. 
128 bits, 8 short, per 16 bits + * __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); + * // Set packed 8-bit integers + * // 128 bits, 16 chars, per 8 bits + * __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11, + * 4, 5, 12, 13, 6, 7, 14, 15); + * // Shuffle packed 8-bit integers + * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb + * + * Data (Number, Binary, Byte Index): + +------+------+-------------+------+------+-------------+ + | 1 | 2 | 3 | 4 | Number + +------+------+------+------+------+------+------+------+ + | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary + +------+------+------+------+------+------+------+------+ + | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | Index + +------+------+------+------+------+------+------+------+ + + +------+------+------+------+------+------+------+------+ + | 5 | 6 | 7 | 8 | Number + +------+------+------+------+------+------+------+------+ + | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary + +------+------+------+------+------+------+------+------+ + | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | Index + +------+------+------+------+------+------+------+------+ + * Index (Byte Index): + +------+------+------+------+------+------+------+------+ + | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | + +------+------+------+------+------+------+------+------+ + + +------+------+------+------+------+------+------+------+ + | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | + +------+------+------+------+------+------+------+------+ + * Result: + +------+------+------+------+------+------+------+------+ + | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | Index + +------+------+------+------+------+------+------+------+ + | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary + +------+------+------+------+------+------+------+------+ + | 256 | 2 | 5 | 6 | Number + +------+------+------+------+------+------+------+------+ + + +------+------+------+------+------+------+------+------+ + | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | Index + +------+------+------+------+------+------+------+------+ + | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary + +------+------+------+------+------+------+------+------+ + | 3 | 7 | 4 | 8 | Number + +------+------+------+------+------+------+-------------+ + */ + +/* Set/get methods */ + +/* Constants for use with _mm_prefetch. */ +enum _mm_hint { + _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */ + _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */ + _MM_HINT_T1 = 2, /* load data to L2 cache only */ + _MM_HINT_T2 = 3, /* load data to L2 cache only, mark it as NTA */ + _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */ + _MM_HINT_ET0 = 5, /* exclusive version of _MM_HINT_T0 */ + _MM_HINT_ET1 = 6, /* exclusive version of _MM_HINT_T1 */ + _MM_HINT_ET2 = 7 /* exclusive version of _MM_HINT_T2 */ +}; + +// Loads one cache line of data from address p to a location closer to the +// processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx +FORCE_INLINE void _mm_prefetch(const void *p, int i) +{ + (void) i; + __builtin_prefetch(p); +} + +// Pause the processor. This is typically used in spin-wait loops and depending +// on the x86 processor typical values are in the 40-100 cycle range. The +// 'yield' instruction isn't a good fit beacuse it's effectively a nop on most +// Arm cores. Experience with several databases has shown has shown an 'isb' is +// a reasonable approximation. 
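Before the implementation that follows, the spin-wait shape _mm_pause() is meant for, as a sketch (the flag and its protocol are illustrative):

    #include <stdatomic.h>
    #include "sse2neon.h"

    static void spin_until_ready(atomic_int *ready)
    {
        /* Under this header _mm_pause() executes 'isb', per the rationale
           above; on x86 the same loop would emit the 'pause' instruction. */
        while (!atomic_load_explicit(ready, memory_order_acquire))
            _mm_pause();
    }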
+FORCE_INLINE void _mm_pause()
+{
+    __asm__ __volatile__("isb\n");
+}
+
+// Copy the lower single-precision (32-bit) floating-point element of a to dst.
+//
+// dst[31:0] := a[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32
+FORCE_INLINE float _mm_cvtss_f32(__m128 a)
+{
+    return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+}
+
+// Convert the lower single-precision (32-bit) floating-point element in b to a
+// double-precision (64-bit) floating-point element, store the result in the
+// lower element of dst, and copy the upper element from a to the upper element
+// of dst.
+//
+// dst[63:0] := Convert_FP32_To_FP64(b[31:0])
+// dst[127:64] := a[127:64]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sd
+FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
+{
+    double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
+#else
+    return vreinterpretq_m128d_s64(
+        vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
+#endif
+}
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer, and store the result in dst.
+//
+// dst[31:0] := Convert_FP32_To_Int32(a[31:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32
+#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 64-bit integer, and store the result in dst.
+//
+// dst[63:0] := Convert_FP32_To_Int64(a[31:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64
+FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
+{
+#if defined(__aarch64__)
+    return vgetq_lane_s64(
+        vreinterpretq_s64_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a))), 0);
+#else
+    float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    float32_t diff = data - floor(data);
+    if (diff > 0.5)
+        return (int64_t) ceil(data);
+    if (unlikely(diff == 0.5)) {
+        int64_t f = (int64_t) floor(data);
+        int64_t c = (int64_t) ceil(data);
+        return c & 1 ? f : c;
+    }
+    return (int64_t) floor(data);
+#endif
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+//
+// FOR j := 0 to 1
+// i := 32*j
+// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi
+FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
+{
+    return vreinterpret_m64_s32(
+        vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
+}
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer with truncation, and store the result in dst.
+//
+// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si
+FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
+{
+    return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers with truncation, and store the results in dst.
+// +// FOR j := 0 to 1 +// i := 32*j +// dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32 +#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 32-bit integer with truncation, and store the result in dst. +// +// dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32 +#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a) + +// Convert the lower single-precision (32-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// +// dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64 +FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a) +{ + return vgetq_lane_s64( + vmovl_s32(vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)))), 0); +} + +// Sets the 128-bit value to zero +// https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx +FORCE_INLINE __m128i _mm_setzero_si128(void) +{ + return vreinterpretq_m128i_s32(vdupq_n_s32(0)); +} + +// Clears the four single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx +FORCE_INLINE __m128 _mm_setzero_ps(void) +{ + return vreinterpretq_m128_f32(vdupq_n_f32(0)); +} + +// Return vector of type __m128d with all elements set to zero. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd +FORCE_INLINE __m128d _mm_setzero_pd(void) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vdupq_n_f64(0)); +#else + return vreinterpretq_m128d_f32(vdupq_n_f32(0)); +#endif +} + +// Sets the four single-precision, floating-point values to w. +// +// r0 := r1 := r2 := r3 := w +// +// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx +FORCE_INLINE __m128 _mm_set1_ps(float _w) +{ + return vreinterpretq_m128_f32(vdupq_n_f32(_w)); +} + +// Sets the four single-precision, floating-point values to w. +// https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx +FORCE_INLINE __m128 _mm_set_ps1(float _w) +{ + return vreinterpretq_m128_f32(vdupq_n_f32(_w)); +} + +// Sets the four single-precision, floating-point values to the four inputs. +// https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx +FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x) +{ + float ALIGN_STRUCT(16) data[4] = {x, y, z, w}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +} + +// Copy single-precision (32-bit) floating-point element a to the lower element +// of dst, and zero the upper 3 elements. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss +FORCE_INLINE __m128 _mm_set_ss(float a) +{ + float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +} + +// Sets the four single-precision, floating-point values to the four inputs in +// reverse order. +// https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx +FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x) +{ + float ALIGN_STRUCT(16) data[4] = {w, z, y, x}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +} + +// Sets the 8 signed 16-bit integer values in reverse order. +// +// Return Value +// r0 := w0 +// r1 := w1 +// ... 
+// r7 := w7 +FORCE_INLINE __m128i _mm_setr_epi16(short w0, + short w1, + short w2, + short w3, + short w4, + short w5, + short w6, + short w7) +{ + int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7}; + return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data)); +} + +// Sets the 4 signed 32-bit integer values in reverse order +// https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx +FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0) +{ + int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0}; + return vreinterpretq_m128i_s32(vld1q_s32(data)); +} + +// Set packed 64-bit integers in dst with the supplied values in reverse order. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64 +FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0) +{ + return vreinterpretq_m128i_s64(vcombine_s64(e1, e0)); +} + +// Sets the 16 signed 8-bit integer values to b. +// +// r0 := b +// r1 := b +// ... +// r15 := b +// +// https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx +FORCE_INLINE __m128i _mm_set1_epi8(signed char w) +{ + return vreinterpretq_m128i_s8(vdupq_n_s8(w)); +} + +// Broadcast double-precision (64-bit) floating-point value a to all elements of +// dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd +FORCE_INLINE __m128d _mm_set1_pd(double d) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vdupq_n_f64(d)); +#else + return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d)); +#endif +} + +// Sets the 8 signed 16-bit integer values to w. +// +// r0 := w +// r1 := w +// ... +// r7 := w +// +// https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx +FORCE_INLINE __m128i _mm_set1_epi16(short w) +{ + return vreinterpretq_m128i_s16(vdupq_n_s16(w)); +} + +// Sets the 16 signed 8-bit integer values. +// https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx +FORCE_INLINE __m128i _mm_set_epi8(signed char b15, + signed char b14, + signed char b13, + signed char b12, + signed char b11, + signed char b10, + signed char b9, + signed char b8, + signed char b7, + signed char b6, + signed char b5, + signed char b4, + signed char b3, + signed char b2, + signed char b1, + signed char b0) +{ + int8_t ALIGN_STRUCT(16) + data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, + (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, + (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, + (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; + return (__m128i) vld1q_s8(data); +} + +// Sets the 8 signed 16-bit integer values. +// https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx +FORCE_INLINE __m128i _mm_set_epi16(short i7, + short i6, + short i5, + short i4, + short i3, + short i2, + short i1, + short i0) +{ + int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; + return vreinterpretq_m128i_s16(vld1q_s16(data)); +} + +// Sets the 16 signed 8-bit integer values in reverse order. 
+// https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
+                                   signed char b1,
+                                   signed char b2,
+                                   signed char b3,
+                                   signed char b4,
+                                   signed char b5,
+                                   signed char b6,
+                                   signed char b7,
+                                   signed char b8,
+                                   signed char b9,
+                                   signed char b10,
+                                   signed char b11,
+                                   signed char b12,
+                                   signed char b13,
+                                   signed char b14,
+                                   signed char b15)
+{
+    int8_t ALIGN_STRUCT(16)
+        data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
+                    (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
+                    (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
+                    (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
+    return (__m128i) vld1q_s8(data);
+}
+
+// Sets the 4 signed 32-bit integer values to i.
+//
+// r0 := i
+// r1 := i
+// r2 := i
+// r3 := i
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_set1_epi32(int _i)
+{
+    return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
+}
+
+// Sets the 2 signed 64-bit integer values to i.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100)
+FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
+{
+    return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i));
+}
+
+// Sets the 2 signed 64-bit integer values to i.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x
+FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
+{
+    return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
+}
+
+// Sets the 4 signed 32-bit integer values.
+// https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
+{
+    int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
+    return vreinterpretq_m128i_s32(vld1q_s32(data));
+}
+
+// Returns the __m128i structure with its two 64-bit integer values
+// initialized to the values of the two 64-bit integers passed in.
+// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
+FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
+{
+    return vreinterpretq_m128i_s64(
+        vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
+}
+
+// Returns the __m128i structure with its two 64-bit integer values
+// initialized to the values of the two 64-bit integers passed in.
+// https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx
+FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
+{
+    return _mm_set_epi64x((int64_t) i1, (int64_t) i2);
+}
+
+// Set packed double-precision (64-bit) floating-point elements in dst with the
+// supplied values.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd
+FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
+{
+    double ALIGN_STRUCT(16) data[2] = {e0, e1};
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
+#else
+    return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
+#endif
+}
+
+// Set packed double-precision (64-bit) floating-point elements in dst with the
+// supplied values in reverse order.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_pd
+FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
+{
+    return _mm_set_pd(e0, e1);
+}
+
+// Copy double-precision (64-bit) floating-point element a to the lower element
+// of dst, and zero the upper element.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sd
+FORCE_INLINE __m128d _mm_set_sd(double a)
+{
+    return _mm_set_pd(0, a);
+}
+
+// Broadcast double-precision (64-bit) floating-point value a to all elements of
+// dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd1
+#define _mm_set_pd1 _mm_set1_pd
+
+// Stores four single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx
+FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
+{
+    vst1q_f32(p, vreinterpretq_f32_m128(a));
+}
+
+// Store the lower single-precision (32-bit) floating-point element from a into
+// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+//
+// MEM[mem_addr+31:mem_addr] := a[31:0]
+// MEM[mem_addr+63:mem_addr+32] := a[31:0]
+// MEM[mem_addr+95:mem_addr+64] := a[31:0]
+// MEM[mem_addr+127:mem_addr+96] := a[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1
+FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
+{
+    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    vst1q_f32(p, vdupq_n_f32(a0));
+}
+
+// Store the lower single-precision (32-bit) floating-point element from a into
+// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+//
+// MEM[mem_addr+31:mem_addr] := a[31:0]
+// MEM[mem_addr+63:mem_addr+32] := a[31:0]
+// MEM[mem_addr+95:mem_addr+64] := a[31:0]
+// MEM[mem_addr+127:mem_addr+96] := a[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps
+#define _mm_store1_ps _mm_store_ps1
+
+// Store 4 single-precision (32-bit) floating-point elements from a into memory
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+//
+// MEM[mem_addr+31:mem_addr] := a[127:96]
+// MEM[mem_addr+63:mem_addr+32] := a[95:64]
+// MEM[mem_addr+95:mem_addr+64] := a[63:32]
+// MEM[mem_addr+127:mem_addr+96] := a[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps
+FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
+{
+    float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
+    float32x4_t rev = vextq_f32(tmp, tmp, 2);
+    vst1q_f32(p, rev);
+}
+
+// Stores four single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx
+FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
+{
+    vst1q_f32(p, vreinterpretq_f32_m128(a));
+}
+
+// Stores four 32-bit integer values (as a __m128i value) at the address p.
+// https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx
+FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
+{
+    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
+}
+
+// Stores 128-bits of integer data a at the address p.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si128
+FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
+{
+    vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
+}
+
+// Stores 64-bits of integer data a at the address p.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si64
+FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
+{
+    vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);
+}
+
+// Stores 32-bits of integer data a at the address p.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si32
+FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
+{
+    vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0);
+}
+
+// Stores 16-bits of integer data a at the address p.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_si16
+FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)
+{
+    vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);
+}
+
+// Stores the lower single-precision, floating-point value.
+// https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx
+FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
+{
+    vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
+}
+
+// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
+// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
+// or a general-protection exception may be generated.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd
+FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+    vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
+#else
+    vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
+#endif
+}
+
+// Store the upper double-precision (64-bit) floating-point element from a into
+// memory.
+//
+// MEM[mem_addr+63:mem_addr] := a[127:64]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeh_pd
+FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+    vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
+#else
+    vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
+#endif
+}
+
+// Store the lower double-precision (64-bit) floating-point element from a into
+// memory.
+//
+// MEM[mem_addr+63:mem_addr] := a[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_pd
+FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
+{
+#if defined(__aarch64__)
+    vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
+#else
+    vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
+#endif
+}
+
+// Store 2 double-precision (64-bit) floating-point elements from a into memory
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+//
+// MEM[mem_addr+63:mem_addr] := a[127:64]
+// MEM[mem_addr+127:mem_addr+64] := a[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_pd
+FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
+{
+    float32x4_t f = vreinterpretq_f32_m128d(a);
+    _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2)));
+}
+
+// Store the lower double-precision (64-bit) floating-point element from a into
+// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
+// boundary or a general-protection exception may be generated.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1 +FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) + float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a)); + vst1q_f64((float64_t *) mem_addr, + vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low))); +#else + float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a)); + vst1q_f32((float32_t *) mem_addr, + vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low))); +#endif +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// memory. mem_addr does not need to be aligned on any particular boundary. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_store_sd +FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a) +{ +#if defined(__aarch64__) + vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a))); +#else + vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a))); +#endif +} + +// Store the lower double-precision (64-bit) floating-point element from a into +// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte +// boundary or a general-protection exception may be generated. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=9,526,5601&text=_mm_store1_pd +#define _mm_store1_pd _mm_store_pd1 + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory. mem_addr does not need to be aligned on any +// particular boundary. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd +FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a) +{ + _mm_store_pd(mem_addr, a); +} + +// Reads the lower 64 bits of b and stores them into the lower 64 bits of a. +// https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx +FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b) +{ + uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a)); + uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b)); + *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi)); +} + +// Stores the lower two single-precision floating point values of a to the +// address p. +// +// *p0 := a0 +// *p1 := a1 +// +// https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx +FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a) +{ + *p = vreinterpret_m64_f32(vget_low_f32(a)); +} + +// Stores the upper two single-precision, floating-point values of a to the +// address p. +// +// *p0 := a2 +// *p1 := a3 +// +// https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx +FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a) +{ + *p = vreinterpret_m64_f32(vget_high_f32(a)); +} + +// Loads a single single-precision, floating-point value, copying it into all +// four words +// https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx +FORCE_INLINE __m128 _mm_load1_ps(const float *p) +{ + return vreinterpretq_m128_f32(vld1q_dup_f32(p)); +} + +// Load a single-precision (32-bit) floating-point element from memory into all +// elements of dst. 
+// +// dst[31:0] := MEM[mem_addr+31:mem_addr] +// dst[63:32] := MEM[mem_addr+31:mem_addr] +// dst[95:64] := MEM[mem_addr+31:mem_addr] +// dst[127:96] := MEM[mem_addr+31:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1 +#define _mm_load_ps1 _mm_load1_ps + +// Sets the lower two single-precision, floating-point values with 64 +// bits of data loaded from the address p; the upper two values are passed +// through from a. +// +// Return Value +// r0 := *p0 +// r1 := *p1 +// r2 := a2 +// r3 := a3 +// +// https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx +FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a))); +} + +// Load 4 single-precision (32-bit) floating-point elements from memory into dst +// in reverse order. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// +// dst[31:0] := MEM[mem_addr+127:mem_addr+96] +// dst[63:32] := MEM[mem_addr+95:mem_addr+64] +// dst[95:64] := MEM[mem_addr+63:mem_addr+32] +// dst[127:96] := MEM[mem_addr+31:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps +FORCE_INLINE __m128 _mm_loadr_ps(const float *p) +{ + float32x4_t v = vrev64q_f32(vld1q_f32(p)); + return vreinterpretq_m128_f32(vextq_f32(v, v, 2)); +} + +// Sets the upper two single-precision, floating-point values with 64 +// bits of data loaded from the address p; the lower two values are passed +// through from a. +// +// r0 := a0 +// r1 := a1 +// r2 := *p0 +// r3 := *p1 +// +// https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx +FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p) +{ + return vreinterpretq_m128_f32( + vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p))); +} + +// Loads four single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx +FORCE_INLINE __m128 _mm_load_ps(const float *p) +{ + return vreinterpretq_m128_f32(vld1q_f32(p)); +} + +// Loads four single-precision, floating-point values. +// https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_loadu_ps(const float *p) +{ + // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are + // equivalent for neon + return vreinterpretq_m128_f32(vld1q_f32(p)); +} + +// Load unaligned 16-bit integer from memory into the first element of dst. +// +// dst[15:0] := MEM[mem_addr+15:mem_addr] +// dst[MAX:16] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16 +FORCE_INLINE __m128i _mm_loadu_si16(const void *p) +{ + return vreinterpretq_m128i_s16( + vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0)); +} + +// Load unaligned 64-bit integer from memory into the first element of dst. +// +// dst[63:0] := MEM[mem_addr+63:mem_addr] +// dst[MAX:64] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64 +FORCE_INLINE __m128i _mm_loadu_si64(const void *p) +{ + return vreinterpretq_m128i_s64( + vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0))); +} + +// Load a double-precision (64-bit) floating-point element from memory into the +// lower of dst, and zero the upper element. mem_addr does not need to be +// aligned on any particular boundary. 
+//
+// dst[63:0] := MEM[mem_addr+63:mem_addr]
+// dst[127:64] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd
+FORCE_INLINE __m128d _mm_load_sd(const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
+#else
+    const float *fp = (const float *) p;
+    float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
+    return vreinterpretq_m128d_f32(vld1q_f32(data));
+#endif
+}
+
+// Loads two double-precision, floating-point values from 16-byte aligned
+// memory.
+//
+// dst[127:0] := MEM[mem_addr+127:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd
+FORCE_INLINE __m128d _mm_load_pd(const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vld1q_f64(p));
+#else
+    const float *fp = (const float *) p;
+    float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
+    return vreinterpretq_m128d_f32(vld1q_f32(data));
+#endif
+}
+
+// Loads two double-precision, floating-point values from unaligned memory.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd
+FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
+{
+    return _mm_load_pd(p);
+}
+
+// Loads a single-precision, floating-point value into the low word and
+// clears the upper three words.
+// https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx
+FORCE_INLINE __m128 _mm_load_ss(const float *p)
+{
+    return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
+}
+
+// Load 64-bit integer from memory into the first element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_epi64
+FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
+{
+    /* Load the lower 64 bits of the value pointed to by p into the
+     * lower 64 bits of the result, zeroing the upper 64 bits of the result.
+     */
+    return vreinterpretq_m128i_s32(
+        vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into the
+// lower element of dst, and copy the upper element from a to dst. mem_addr does
+// not need to be aligned on any particular boundary.
+//
+// dst[63:0] := MEM[mem_addr+63:mem_addr]
+// dst[127:64] := a[127:64]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd
+FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
+#else
+    return vreinterpretq_m128d_f32(
+        vcombine_f32(vld1_f32((const float *) p),
+                     vget_high_f32(vreinterpretq_f32_m128d(a))));
+#endif
+}
+
+// Load 2 double-precision (64-bit) floating-point elements from memory into dst
+// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
+// general-protection exception may be generated.
+// +// dst[63:0] := MEM[mem_addr+127:mem_addr+64] +// dst[127:64] := MEM[mem_addr+63:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd +FORCE_INLINE __m128d _mm_loadr_pd(const double *p) +{ +#if defined(__aarch64__) + float64x2_t v = vld1q_f64(p); + return vreinterpretq_m128d_f64(vextq_f64(v, v, 1)); +#else + int64x2_t v = vld1q_s64((const int64_t *) p); + return vreinterpretq_m128d_s64(vextq_s64(v, v, 1)); +#endif +} + +// Sets the low word to the single-precision, floating-point value of b +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100) +FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0), + vreinterpretq_f32_m128(a), 0)); +} + +// Move the lower double-precision (64-bit) floating-point element from b to the +// lower element of dst, and copy the upper element from a to the upper element +// of dst. +// +// dst[63:0] := b[63:0] +// dst[127:64] := a[127:64] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd +FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_f32( + vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)), + vget_high_f32(vreinterpretq_f32_m128d(a)))); +} + +// Copy the lower 64-bit integer in a to the lower element of dst, and zero the +// upper element. +// +// dst[63:0] := a[63:0] +// dst[127:64] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64 +FORCE_INLINE __m128i _mm_move_epi64(__m128i a) +{ + return vreinterpretq_m128i_s64( + vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1)); +} + +// Return vector of type __m128 with undefined elements. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps +FORCE_INLINE __m128 _mm_undefined_ps(void) +{ +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuninitialized" +#endif + __m128 a; + return a; +#if defined(__GNUC__) || defined(__clang__) +#pragma GCC diagnostic pop +#endif +} + +/* Logic/Binary operations */ + +// Computes the bitwise AND-NOT of the four single-precision, floating-point +// values of a and b. +// +// r0 := ~a0 & b0 +// r1 := ~a1 & b1 +// r2 := ~a2 & b2 +// r3 := ~a3 & b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx +FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vbicq_s32(vreinterpretq_s32_m128(b), + vreinterpretq_s32_m128(a))); // *NOTE* argument swap +} + +// Compute the bitwise NOT of packed double-precision (64-bit) floating-point +// elements in a and then AND with b, and store the results in dst. +// +// FOR j := 0 to 1 +// i := j*64 +// dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd +FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b) +{ + // *NOTE* argument swap + return vreinterpretq_m128d_s64( + vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a))); +} + +// Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the +// 128-bit value in a. 
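+//
+// A common masking idiom (illustrative, not part of the upstream comment):
+//
+//   __m128i kept = _mm_andnot_si128(mask, data); // kept = (~mask) & data
+//
+// i.e. it clears exactly the bits of data that are set in mask.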
+// +// r := (~a) & b +// +// https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx +FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vbicq_s32(vreinterpretq_s32_m128i(b), + vreinterpretq_s32_m128i(a))); // *NOTE* argument swap +} + +// Computes the bitwise AND of the 128-bit value in a and the 128-bit value in +// b. +// +// r := a & b +// +// https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx +FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Computes the bitwise AND of the four single-precision, floating-point values +// of a and b. +// +// r0 := a0 & b0 +// r1 := a1 & b1 +// r2 := a2 & b2 +// r3 := a3 & b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx +FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +// Compute the bitwise AND of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// +// FOR j := 0 to 1 +// i := j*64 +// dst[i+63:i] := a[i+63:i] AND b[i+63:i] +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd +FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Computes the bitwise OR of the four single-precision, floating-point values +// of a and b. +// https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx +FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +// Computes bitwise EXOR (exclusive-or) of the four single-precision, +// floating-point values of a and b. +// https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx +FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_s32( + veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); +} + +// Compute the bitwise XOR of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// +// FOR j := 0 to 1 +// i := j*64 +// dst[i+63:i] := a[i+63:i] XOR b[i+63:i] +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd +FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Compute the bitwise OR of packed double-precision (64-bit) floating-point +// elements in a and b, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_or_pd +FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b) +{ + return vreinterpretq_m128d_s64( + vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); +} + +// Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b. +// +// r := a | b +// +// https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx +FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in +// b. 
https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx +FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Duplicate the low double-precision (64-bit) floating-point element from a, +// and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movedup_pd +FORCE_INLINE __m128d _mm_movedup_pd(__m128d a) +{ +#if (__aarch64__) + return vreinterpretq_m128d_f64( + vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0)); +#else + return vreinterpretq_m128d_u64( + vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0))); +#endif +} + +// Duplicate odd-indexed single-precision (32-bit) floating-point elements +// from a, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps +FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a) +{ +#if __has_builtin(__builtin_shufflevector) + return vreinterpretq_m128_f32(__builtin_shufflevector( + vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3)); +#else + float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); + float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3); + float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +#endif +} + +// Duplicate even-indexed single-precision (32-bit) floating-point elements +// from a, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps +FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a) +{ +#if __has_builtin(__builtin_shufflevector) + return vreinterpretq_m128_f32(__builtin_shufflevector( + vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2)); +#else + float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2); + float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2}; + return vreinterpretq_m128_f32(vld1q_f32(data)); +#endif +} + +// Moves the upper two values of B into the lower two values of A. +// +// r3 := a3 +// r2 := a2 +// r1 := b3 +// r0 := b2 +FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B) +{ + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A)); + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B)); + return vreinterpretq_m128_f32(vcombine_f32(b32, a32)); +} + +// Moves the lower two values of B into the upper two values of A. +// +// r3 := b1 +// r2 := b0 +// r1 := a1 +// r0 := a0 +FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); +} + +// Create mask from the most significant bit of each 8-bit element in a, and +// store the result in dst. 
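+// Illustrative example (hypothetical values), assuming the usual
+// _mm_set_pi8(b7, ..., b0) element ordering:
+//
+//   __m64 v = _mm_set_pi8(-1, 0, -1, 0, 0, 0, 0, -1);
+//   int m = _mm_movemask_pi8(v); // m == 0xA1 (sign bits of elements 7, 5, 0)
+//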
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pi8 +FORCE_INLINE int _mm_movemask_pi8(__m64 a) +{ + uint8x8_t input = vreinterpret_u8_m64(a); +#if defined(__aarch64__) + static const int8x8_t shift = {0, 1, 2, 3, 4, 5, 6, 7}; + uint8x8_t tmp = vshr_n_u8(input, 7); + return vaddv_u8(vshl_u8(tmp, shift)); +#else + // Refer the implementation of `_mm_movemask_epi8` + uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7)); + uint32x2_t paired16 = + vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7)); + uint8x8_t paired32 = + vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14)); + return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4); +#endif +} + +// Compute the absolute value of packed signed 32-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 3 +// i := j*32 +// dst[i+31:i] := ABS(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32 +FORCE_INLINE __m128i _mm_abs_epi32(__m128i a) +{ + return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a))); +} + +// Compute the absolute value of packed signed 16-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 7 +// i := j*16 +// dst[i+15:i] := ABS(a[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16 +FORCE_INLINE __m128i _mm_abs_epi16(__m128i a) +{ + return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a))); +} + +// Compute the absolute value of packed signed 8-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 15 +// i := j*8 +// dst[i+7:i] := ABS(a[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8 +FORCE_INLINE __m128i _mm_abs_epi8(__m128i a) +{ + return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a))); +} + +// Compute the absolute value of packed signed 32-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 1 +// i := j*32 +// dst[i+31:i] := ABS(a[i+31:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32 +FORCE_INLINE __m64 _mm_abs_pi32(__m64 a) +{ + return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a))); +} + +// Compute the absolute value of packed signed 16-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := ABS(a[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16 +FORCE_INLINE __m64 _mm_abs_pi16(__m64 a) +{ + return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a))); +} + +// Compute the absolute value of packed signed 8-bit integers in a, and store +// the unsigned results in dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := ABS(a[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8 +FORCE_INLINE __m64 _mm_abs_pi8(__m64 a) +{ + return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a))); +} + +// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift +// the result right by imm8 bytes, and store the low 16 bytes in dst. 
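+// Worked example (illustrative): with imm8 == 4 the result is bytes 4..15 of
+// b followed by bytes 0..3 of a:
+//
+//   __m128i r = _mm_alignr_epi8(a, b, 4);
+//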
+// +// tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8) +// dst[127:0] := tmp[127:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi8 +#define _mm_alignr_epi8(a, b, imm) \ + __extension__({ \ + __m128i ret; \ + if (unlikely((imm) >= 32)) { \ + ret = _mm_setzero_si128(); \ + } else { \ + uint8x16_t tmp_low, tmp_high; \ + if (imm >= 16) { \ + const int idx = imm - 16; \ + tmp_low = vreinterpretq_u8_m128i(a); \ + tmp_high = vdupq_n_u8(0); \ + ret = \ + vreinterpretq_m128i_u8(vextq_u8(tmp_low, tmp_high, idx)); \ + } else { \ + const int idx = imm; \ + tmp_low = vreinterpretq_u8_m128i(b); \ + tmp_high = vreinterpretq_u8_m128i(a); \ + ret = \ + vreinterpretq_m128i_u8(vextq_u8(tmp_low, tmp_high, idx)); \ + } \ + } \ + ret; \ + }) + +// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift +// the result right by imm8 bytes, and store the low 8 bytes in dst. +// +// tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8) +// dst[63:0] := tmp[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_pi8 +#define _mm_alignr_pi8(a, b, imm) \ + __extension__({ \ + __m64 ret; \ + if (unlikely((imm) >= 16)) { \ + ret = vreinterpret_m64_s8(vdup_n_s8(0)); \ + } else { \ + uint8x8_t tmp_low, tmp_high; \ + if (imm >= 8) { \ + const int idx = imm - 8; \ + tmp_low = vreinterpret_u8_m64(a); \ + tmp_high = vdup_n_u8(0); \ + ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ + } else { \ + const int idx = imm; \ + tmp_low = vreinterpret_u8_m64(b); \ + tmp_high = vreinterpret_u8_m64(a); \ + ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ + } \ + } \ + ret; \ + }) + +// Takes the upper 64 bits of a and places it in the low end of the result +// Takes the lower 64 bits of b and places it into the high end of the result. +FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b) +{ + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a32, b10)); +} + +// takes the lower two 32-bit values from a and swaps them and places in high +// end of result takes the higher two 32 bit values from b and swaps them and +// places in low end of result. 
+FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b))); + return vreinterpretq_m128_f32(vcombine_f32(a01, b23)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b) +{ + float32x2_t a21 = vget_high_f32( + vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); + float32x2_t b03 = vget_low_f32( + vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); + return vreinterpretq_m128_f32(vcombine_f32(a21, b03)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b) +{ + float32x2_t a03 = vget_low_f32( + vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); + float32x2_t b21 = vget_high_f32( + vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); + return vreinterpretq_m128_f32(vcombine_f32(a03, b21)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a01, b10)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b))); + return vreinterpretq_m128_f32(vcombine_f32(a01, b01)); +} + +// keeps the low 64 bits of b in the low and puts the high 64 bits of a in the +// high +FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a10, b32)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b) +{ + float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a11, b00)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b) +{ + float32x2_t a22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a22, b00)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b) +{ + float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t b22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0); + return vreinterpretq_m128_f32(vcombine_f32(a00, b22)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b) +{ + float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); + float32x2_t a22 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); + float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/ + float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_f32(vcombine_f32(a02, b32)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b) +{ + float32x2_t a33 = + vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1); + float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1); + 
return vreinterpretq_m128_f32(vcombine_f32(a33, b11)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b) +{ + float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a10, b20)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b) +{ + float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); + float32_t b2 = vgetq_lane_f32(b, 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a01, b20)); +} + +FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b) +{ + float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32_t b2 = vgetq_lane_f32(b, 2); + float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); + float32x2_t b20 = vset_lane_f32(b2, b00, 1); + return vreinterpretq_m128_f32(vcombine_f32(a32, b20)); +} + +// NEON does not support a general purpose permute intrinsic +// Selects four specific single-precision, floating-point values from a and b, +// based on the mask i. +// +// C equivalent: +// __m128 _mm_shuffle_ps_default(__m128 a, __m128 b, +// __constrange(0, 255) int imm) { +// __m128 ret; +// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; +// ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03]; +// return ret; +// } +// +// https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx +#define _mm_shuffle_ps_default(a, b, imm) \ + __extension__({ \ + float32x4_t ret; \ + ret = vmovq_n_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))); \ + ret = vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \ + ret, 1); \ + ret = vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \ + ret, 2); \ + ret = vsetq_lane_f32( \ + vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \ + ret, 3); \ + vreinterpretq_m128_f32(ret); \ + }) + +// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255) +// int imm) +#if __has_builtin(__builtin_shufflevector) +#define _mm_shuffle_ps(a, b, imm) \ + __extension__({ \ + float32x4_t _input1 = vreinterpretq_f32_m128(a); \ + float32x4_t _input2 = vreinterpretq_f32_m128(b); \ + float32x4_t _shuf = __builtin_shufflevector( \ + _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \ + (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \ + vreinterpretq_m128_f32(_shuf); \ + }) +#else // generic +#define _mm_shuffle_ps(a, b, imm) \ + __extension__({ \ + __m128 ret; \ + switch (imm) { \ + case _MM_SHUFFLE(1, 0, 3, 2): \ + ret = _mm_shuffle_ps_1032((a), (b)); \ + break; \ + case _MM_SHUFFLE(2, 3, 0, 1): \ + ret = _mm_shuffle_ps_2301((a), (b)); \ + break; \ + case _MM_SHUFFLE(0, 3, 2, 1): \ + ret = _mm_shuffle_ps_0321((a), (b)); \ + break; \ + case _MM_SHUFFLE(2, 1, 0, 3): \ + ret = _mm_shuffle_ps_2103((a), (b)); \ + break; \ + case _MM_SHUFFLE(1, 0, 1, 0): \ + ret = _mm_movelh_ps((a), (b)); \ + break; \ + case _MM_SHUFFLE(1, 0, 0, 1): \ + ret = _mm_shuffle_ps_1001((a), (b)); \ + break; \ + case _MM_SHUFFLE(0, 1, 0, 1): \ + ret = _mm_shuffle_ps_0101((a), (b)); \ + break; \ + case _MM_SHUFFLE(3, 2, 1, 0): \ + ret = _mm_shuffle_ps_3210((a), (b)); \ + break; \ + case _MM_SHUFFLE(0, 0, 1, 1): \ + ret = 
_mm_shuffle_ps_0011((a), (b));                 \
+            break;                                         \
+        case _MM_SHUFFLE(0, 0, 2, 2):                      \
+            ret = _mm_shuffle_ps_0022((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 2, 0, 0):                      \
+            ret = _mm_shuffle_ps_2200((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(3, 2, 0, 2):                      \
+            ret = _mm_shuffle_ps_3202((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(3, 2, 3, 2):                      \
+            ret = _mm_movehl_ps((b), (a));                 \
+            break;                                         \
+        case _MM_SHUFFLE(1, 1, 3, 3):                      \
+            ret = _mm_shuffle_ps_1133((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 0, 1, 0):                      \
+            ret = _mm_shuffle_ps_2010((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 0, 0, 1):                      \
+            ret = _mm_shuffle_ps_2001((a), (b));           \
+            break;                                         \
+        case _MM_SHUFFLE(2, 0, 3, 2):                      \
+            ret = _mm_shuffle_ps_2032((a), (b));           \
+            break;                                         \
+        default:                                           \
+            ret = _mm_shuffle_ps_default((a), (b), (imm)); \
+            break;                                         \
+        }                                                  \
+        ret;                                               \
+    })
+#endif
+
+// Takes the upper 64 bits of a and places it in the low end of the result.
+// Takes the lower 64 bits of a and places it into the high end of the result.
+FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
+{
+    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
+}
+
+// Takes the lower two 32-bit values from a, swaps them and places them in the
+// low end of the result; takes the higher two 32-bit values from a, swaps
+// them and places them in the high end of the result.
+FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
+{
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
+}
+
+// Rotates the least significant 32 bits into the most significant 32 bits,
+// and shifts the rest down.
+FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
+{
+    return vreinterpretq_m128i_s32(
+        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
+}
+
+// Rotates the most significant 32 bits into the least significant 32 bits,
+// and shifts the rest up.
+FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
+{
+    return vreinterpretq_m128i_s32(
+        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
+}
+
+// Gets the lower 64 bits of a, and places it in the upper 64 bits.
+// Gets the lower 64 bits of a and places it in the lower 64 bits.
+FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
+{
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
+}
+
+// Gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in
+// the lower 64 bits. Gets the lower 64 bits of a, and places it in the upper
+// 64 bits.
+FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
+{
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
+}
+
+// Gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in
+// the upper 64 bits. Gets the lower 64 bits of a, swaps the 0 and 1 elements,
+// and places it in the lower 64 bits.
+FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
+{
+    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
+    return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
+}
+
+FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
+{
+    int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
+    int32x2_t a22 =
vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); + return vreinterpretq_m128i_s32(vcombine_s32(a11, a22)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a) +{ + int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); + int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); + return vreinterpretq_m128i_s32(vcombine_s32(a22, a01)); +} + +FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a) +{ + int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1); + return vreinterpretq_m128i_s32(vcombine_s32(a32, a33)); +} + +// Shuffle packed 8-bit integers in a according to shuffle control mask in the +// corresponding 8-bit element of b, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8 +FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b) +{ + int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a + uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b + uint8x16_t idx_masked = + vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits +#if defined(__aarch64__) + return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked)); +#elif defined(__GNUC__) + int8x16_t ret; + // %e and %f represent the even and odd D registers + // respectively. + __asm__ __volatile__( + "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n" + "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n" + : [ret] "=&w"(ret) + : [tbl] "w"(tbl), [idx] "w"(idx_masked)); + return vreinterpretq_m128i_s8(ret); +#else + // use this line if testing on aarch64 + int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)}; + return vreinterpretq_m128i_s8( + vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)), + vtbl2_s8(a_split, vget_high_u8(idx_masked)))); +#endif +} + +// C equivalent: +// __m128i _mm_shuffle_epi32_default(__m128i a, +// __constrange(0, 255) int imm) { +// __m128i ret; +// ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; +// ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03]; +// return ret; +// } +#define _mm_shuffle_epi32_default(a, imm) \ + __extension__({ \ + int32x4_t ret; \ + ret = vmovq_n_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3))); \ + ret = vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \ + ret, 1); \ + ret = vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \ + ret, 2); \ + ret = vsetq_lane_s32( \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \ + ret, 3); \ + vreinterpretq_m128i_s32(ret); \ + }) + +// FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255) +// int imm) +#if defined(__aarch64__) +#define _mm_shuffle_epi32_splat(a, imm) \ + __extension__({ \ + vreinterpretq_m128i_s32( \ + vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \ + }) +#else +#define _mm_shuffle_epi32_splat(a, imm) \ + __extension__({ \ + vreinterpretq_m128i_s32( \ + vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \ + }) +#endif + +// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm. 
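+// Illustrative usage (hypothetical values):
+//
+//   __m128i v = _mm_set_epi32(3, 2, 1, 0); // elements {0, 1, 2, 3}
+//   __m128i r = _mm_shuffle_epi32(v, _MM_SHUFFLE(0, 1, 2, 3));
+//   // r holds {3, 2, 1, 0}: element i of r is element ((imm >> 2*i) & 0x3)
+//   // of v, so this particular mask reverses the element order.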
+
+// Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm.
+// https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx
+// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
+//                                        __constrange(0,255) int imm)
+#if __has_builtin(__builtin_shufflevector)
+#define _mm_shuffle_epi32(a, imm) \
+    __extension__({ \
+        int32x4_t _input = vreinterpretq_s32_m128i(a); \
+        int32x4_t _shuf = __builtin_shufflevector( \
+            _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
+            ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \
+        vreinterpretq_m128i_s32(_shuf); \
+    })
+#else  // generic
+#define _mm_shuffle_epi32(a, imm) \
+    __extension__({ \
+        __m128i ret; \
+        switch (imm) { \
+        case _MM_SHUFFLE(1, 0, 3, 2): \
+            ret = _mm_shuffle_epi_1032((a)); \
+            break; \
+        case _MM_SHUFFLE(2, 3, 0, 1): \
+            ret = _mm_shuffle_epi_2301((a)); \
+            break; \
+        case _MM_SHUFFLE(0, 3, 2, 1): \
+            ret = _mm_shuffle_epi_0321((a)); \
+            break; \
+        case _MM_SHUFFLE(2, 1, 0, 3): \
+            ret = _mm_shuffle_epi_2103((a)); \
+            break; \
+        case _MM_SHUFFLE(1, 0, 1, 0): \
+            ret = _mm_shuffle_epi_1010((a)); \
+            break; \
+        case _MM_SHUFFLE(1, 0, 0, 1): \
+            ret = _mm_shuffle_epi_1001((a)); \
+            break; \
+        case _MM_SHUFFLE(0, 1, 0, 1): \
+            ret = _mm_shuffle_epi_0101((a)); \
+            break; \
+        case _MM_SHUFFLE(2, 2, 1, 1): \
+            ret = _mm_shuffle_epi_2211((a)); \
+            break; \
+        case _MM_SHUFFLE(0, 1, 2, 2): \
+            ret = _mm_shuffle_epi_0122((a)); \
+            break; \
+        case _MM_SHUFFLE(3, 3, 3, 2): \
+            ret = _mm_shuffle_epi_3332((a)); \
+            break; \
+        case _MM_SHUFFLE(0, 0, 0, 0): \
+            ret = _mm_shuffle_epi32_splat((a), 0); \
+            break; \
+        case _MM_SHUFFLE(1, 1, 1, 1): \
+            ret = _mm_shuffle_epi32_splat((a), 1); \
+            break; \
+        case _MM_SHUFFLE(2, 2, 2, 2): \
+            ret = _mm_shuffle_epi32_splat((a), 2); \
+            break; \
+        case _MM_SHUFFLE(3, 3, 3, 3): \
+            ret = _mm_shuffle_epi32_splat((a), 3); \
+            break; \
+        default: \
+            ret = _mm_shuffle_epi32_default((a), (imm)); \
+            break; \
+        } \
+        ret; \
+    })
+#endif
+
+// Shuffles the lower 4 signed or unsigned 16-bit integers in a as specified
+// by imm.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100)
+// FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a,
+//                                                   __constrange(0,255) int
+//                                                   imm)
+#define _mm_shufflelo_epi16_function(a, imm) \
+    __extension__({ \
+        int16x8_t ret = vreinterpretq_s16_m128i(a); \
+        int16x4_t lowBits = vget_low_s16(ret); \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
+                             1); \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
+                             2); \
+        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
+                             3); \
+        vreinterpretq_m128i_s16(ret); \
+    })
+
+// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
+//                                          __constrange(0,255) int imm)
+#if __has_builtin(__builtin_shufflevector)
+#define _mm_shufflelo_epi16(a, imm) \
+    __extension__({ \
+        int16x8_t _input = vreinterpretq_s16_m128i(a); \
+        int16x8_t _shuf = __builtin_shufflevector( \
+            _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \
+            (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
+        vreinterpretq_m128i_s16(_shuf); \
+    })
+#else  // generic
+#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
+#endif
+
+// Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified
+// by imm.
+// https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx +// FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a, +// __constrange(0,255) int +// imm) +#define _mm_shufflehi_epi16_function(a, imm) \ + __extension__({ \ + int16x8_t ret = vreinterpretq_s16_m128i(a); \ + int16x4_t highBits = vget_high_s16(ret); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \ + 5); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \ + 6); \ + ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \ + 7); \ + vreinterpretq_m128i_s16(ret); \ + }) + +// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a, +// __constrange(0,255) int imm) +#if __has_builtin(__builtin_shufflevector) +#define _mm_shufflehi_epi16(a, imm) \ + __extension__({ \ + int16x8_t _input = vreinterpretq_s16_m128i(a); \ + int16x8_t _shuf = __builtin_shufflevector( \ + _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \ + (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \ + (((imm) >> 6) & 0x3) + 4); \ + vreinterpretq_m128i_s16(_shuf); \ + }) +#else // generic +#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm)) +#endif + +// Shuffle double-precision (64-bit) floating-point elements using the control +// in imm8, and store the results in dst. +// +// dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] +// dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pd +#if __has_builtin(__builtin_shufflevector) +#define _mm_shuffle_pd(a, b, imm8) \ + vreinterpretq_m128d_s64(__builtin_shufflevector( \ + vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), imm8 & 0x1, \ + ((imm8 & 0x2) >> 1) + 2)) +#else +#define _mm_shuffle_pd(a, b, imm8) \ + _mm_castsi128_pd(_mm_set_epi64x( \ + vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \ + vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1))) +#endif + +// Blend packed 16-bit integers from a and b using control mask imm8, and store +// the results in dst. +// +// FOR j := 0 to 7 +// i := j*16 +// IF imm8[j] +// dst[i+15:i] := b[i+15:i] +// ELSE +// dst[i+15:i] := a[i+15:i] +// FI +// ENDFOR +// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b, +// __constrange(0,255) int imm) +#define _mm_blend_epi16(a, b, imm) \ + __extension__({ \ + const uint16_t _mask[8] = {((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0, \ + ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0}; \ + uint16x8_t _mask_vec = vld1q_u16(_mask); \ + uint16x8_t _a = vreinterpretq_u16_m128i(a); \ + uint16x8_t _b = vreinterpretq_u16_m128i(b); \ + vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \ + }) + +// Blend packed double-precision (64-bit) floating-point elements from a and b +// using control mask imm8, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_pd +#define _mm_blend_pd(a, b, imm) \ + __extension__({ \ + const uint64_t _mask[2] = { \ + ((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \ + ((imm) & (1 << 1)) ? 
~UINT64_C(0) : UINT64_C(0)}; \
+        uint64x2_t _mask_vec = vld1q_u64(_mask); \
+        uint64x2_t _a = vreinterpretq_u64_m128d(a); \
+        uint64x2_t _b = vreinterpretq_u64_m128d(b); \
+        vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, _b, _a)); \
+    })
+
+// Blend packed 8-bit integers from a and b using mask, and store the results
+// in dst.
+//
+// FOR j := 0 to 15
+// i := j*8
+// IF mask[i+7]
+// dst[i+7:i] := b[i+7:i]
+// ELSE
+// dst[i+7:i] := a[i+7:i]
+// FI
+// ENDFOR
+FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
+{
+    // Use a signed shift right to create a mask with the sign bit
+    uint8x16_t mask =
+        vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
+    uint8x16_t a = vreinterpretq_u8_m128i(_a);
+    uint8x16_t b = vreinterpretq_u8_m128i(_b);
+    return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
+}
+
+/* Shifts */
+
+// Shift packed 16-bit integers in a right by imm while shifting in sign
+// bits, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16
+FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
+{
+    const int count = (imm & ~15) ? 15 : imm;
+    return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
+}
+
+// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits
+// while shifting in zeros.
+//
+// r0 := a0 << count
+// r1 := a1 << count
+// ...
+// r7 := a7 << count
+//
+// https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx
+#define _mm_slli_epi16(a, imm) \
+    __extension__({ \
+        __m128i ret; \
+        if (unlikely((imm) <= 0)) { \
+            ret = a; \
+        } else if (unlikely((imm) > 15)) { \
+            ret = _mm_setzero_si128(); \
+        } else { \
+            ret = vreinterpretq_m128i_s16( \
+                vshlq_n_s16(vreinterpretq_s16_m128i(a), (imm))); \
+        } \
+        ret; \
+    })
+
+// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits
+// while shifting in zeros.
+// https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx
+// FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm)
+FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
+{
+    if (unlikely(imm <= 0)) /* TODO: add constant range macro: [0, 255] */
+        return a;
+    if (unlikely(imm > 31))
+        return _mm_setzero_si128();
+    return vreinterpretq_m128i_s32(
+        vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
+}
+
+// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
+// store the results in dst.
+FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
+{
+    if (unlikely(imm <= 0)) /* TODO: add constant range macro: [0, 255] */
+        return a;
+    if (unlikely(imm > 63))
+        return _mm_setzero_si128();
+    return vreinterpretq_m128i_s64(
+        vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
+}
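+
+// Usage sketch (illustrative addition, not part of the x86 API): for the
+// macro-based shifts here, imm must be a compile-time constant, since they
+// expand to NEON intrinsics with immediate operands. A hypothetical helper
+// multiplying all eight 16-bit lanes by 8 via a constant left shift:
+FORCE_INLINE __m128i sse2neon_example_mul8_epi16(__m128i v)
+{
+    // imm is the literal 3, so the macro selects the vshlq_n_s16 branch.
+    return _mm_slli_epi16(v, 3);
+}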
+
+// Shift packed 16-bit integers in a right by imm8 while shifting in zeros,
+// and store the results in dst.
+//
+// FOR j := 0 to 7
+// i := j*16
+// IF imm8[7:0] > 15
+// dst[i+15:i] := 0
+// ELSE
+// dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0])
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16
+#define _mm_srli_epi16(a, imm) \
+    __extension__({ \
+        __m128i ret; \
+        if (unlikely((imm) == 0)) { \
+            ret = a; \
+        } else if (likely(0 < (imm) && (imm) < 16)) { \
+            ret = vreinterpretq_m128i_u16( \
+                vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-imm))); \
+        } else { \
+            ret = _mm_setzero_si128(); \
+        } \
+        ret; \
+    })
+
+// Shift packed 32-bit integers in a right by imm8 while shifting in zeros,
+// and store the results in dst.
+//
+// FOR j := 0 to 3
+// i := j*32
+// IF imm8[7:0] > 31
+// dst[i+31:i] := 0
+// ELSE
+// dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0])
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32
+// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
+#define _mm_srli_epi32(a, imm) \
+    __extension__({ \
+        __m128i ret; \
+        if (unlikely((imm) == 0)) { \
+            ret = a; \
+        } else if (likely(0 < (imm) && (imm) < 32)) { \
+            ret = vreinterpretq_m128i_u32( \
+                vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-imm))); \
+        } else { \
+            ret = _mm_setzero_si128(); \
+        } \
+        ret; \
+    })
+
+// Shift packed 64-bit integers in a right by imm8 while shifting in zeros,
+// and store the results in dst.
+//
+// FOR j := 0 to 1
+// i := j*64
+// IF imm8[7:0] > 63
+// dst[i+63:i] := 0
+// ELSE
+// dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0])
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64
+#define _mm_srli_epi64(a, imm) \
+    __extension__({ \
+        __m128i ret; \
+        if (unlikely((imm) == 0)) { \
+            ret = a; \
+        } else if (likely(0 < (imm) && (imm) < 64)) { \
+            ret = vreinterpretq_m128i_u64( \
+                vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-imm))); \
+        } else { \
+            ret = _mm_setzero_si128(); \
+        } \
+        ret; \
+    })
+
+// Shift packed 32-bit integers in a right by imm8 while shifting in sign
+// bits, and store the results in dst.
+//
+// FOR j := 0 to 3
+// i := j*32
+// IF imm8[7:0] > 31
+// dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
+// ELSE
+// dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0])
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32
+// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
+#define _mm_srai_epi32(a, imm) \
+    __extension__({ \
+        __m128i ret; \
+        if (unlikely((imm) == 0)) { \
+            ret = a; \
+        } else if (likely(0 < (imm) && (imm) < 32)) { \
+            ret = vreinterpretq_m128i_s32( \
+                vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-imm))); \
+        } else { \
+            ret = vreinterpretq_m128i_s32( \
+                vshrq_n_s32(vreinterpretq_s32_m128i(a), 31)); \
+        } \
+        ret; \
+    })
+
+// Shifts the 128-bit value in a right by imm bytes while shifting in zeros.
+// imm must be an immediate.
+//
+// r := srl(a, imm*8)
+//
+// https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx
+// FORCE_INLINE _mm_srli_si128(__m128i a, __constrange(0,255) int imm)
+#define _mm_srli_si128(a, imm) \
+    __extension__({ \
+        __m128i ret; \
+        if (unlikely((imm) <= 0)) { \
+            ret = a; \
+        } else if (unlikely((imm) > 15)) { \
+            ret = _mm_setzero_si128(); \
+        } else { \
+            ret = vreinterpretq_m128i_s8( \
+                vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), (imm))); \
+        } \
+        ret; \
+    })
+
+// Shifts the 128-bit value in a left by imm bytes while shifting in zeros.
+// imm must be an immediate.
+//
+// r := a << (imm * 8)
+//
+// https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx
+// FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm)
+#define _mm_slli_si128(a, imm) \
+    __extension__({ \
+        __m128i ret; \
+        if (unlikely((imm) <= 0)) { \
+            ret = a; \
+        } else if (unlikely((imm) > 15)) { \
+            ret = _mm_setzero_si128(); \
+        } else { \
+            ret = vreinterpretq_m128i_s8(vextq_s8( \
+                vdupq_n_s8(0), vreinterpretq_s8_m128i(a), 16 - (imm))); \
+        } \
+        ret; \
+    })
+
+// Compute the square root of packed double-precision (64-bit) floating-point
+// elements in a, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_pd
+FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
+#else
+    double a0 = sqrt(((double *) &a)[0]);
+    double a1 = sqrt(((double *) &a)[1]);
+    return _mm_set_pd(a1, a0);
+#endif
+}
+
+// Compute the square root of the lower double-precision (64-bit)
+// floating-point element in b, store the result in the lower element of dst,
+// and copy the upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sqrt_sd
+FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_sqrt_pd(b));
+#else
+    return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0]));
+#endif
+}
+
+// Shifts the 8 signed or unsigned 16-bit integers in a left by count bits
+// while shifting in zeros.
+//
+// r0 := a0 << count
+// r1 := a1 << count
+// ...
+// r7 := a7 << count
+//
+// https://msdn.microsoft.com/en-us/library/c79w388h(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (unlikely(c > 15))
+        return _mm_setzero_si128();
+
+    int16x8_t vc = vdupq_n_s16((int16_t) c);
+    return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
+}
+
+// Shifts the 4 signed or unsigned 32-bit integers in a left by count bits
+// while shifting in zeros.
+//
+// r0 := a0 << count
+// r1 := a1 << count
+// r2 := a2 << count
+// r3 := a3 << count
+//
+// https://msdn.microsoft.com/en-us/library/6fe5a6s9(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
+{
+    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
+    if (unlikely(c > 31))
+        return _mm_setzero_si128();
+
+    int32x4_t vc = vdupq_n_s32((int32_t) c);
+    return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
+}
+
+// Shifts the 2 signed or unsigned 64-bit integers in a left by count bits
+// while shifting in zeros.
+// +// r0 := a0 << count +// r1 := a1 << count +// +// https://msdn.microsoft.com/en-us/library/6ta9dffd(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (unlikely(c > 63)) + return _mm_setzero_si128(); + + int64x2_t vc = vdupq_n_s64((int64_t) c); + return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc)); +} + +// Shifts the 8 signed or unsigned 16-bit integers in a right by count bits +// while shifting in zeros. +// +// r0 := srl(a0, count) +// r1 := srl(a1, count) +// ... +// r7 := srl(a7, count) +// +// https://msdn.microsoft.com/en-us/library/wd5ax830(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (unlikely(c > 15)) + return _mm_setzero_si128(); + + int16x8_t vc = vdupq_n_s16(-(int16_t) c); + return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc)); +} + +// Shifts the 4 signed or unsigned 32-bit integers in a right by count bits +// while shifting in zeros. +// +// r0 := srl(a0, count) +// r1 := srl(a1, count) +// r2 := srl(a2, count) +// r3 := srl(a3, count) +// +// https://msdn.microsoft.com/en-us/library/a9cbttf4(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (unlikely(c > 31)) + return _mm_setzero_si128(); + + int32x4_t vc = vdupq_n_s32(-(int32_t) c); + return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc)); +} + +// Shifts the 2 signed or unsigned 64-bit integers in a right by count bits +// while shifting in zeros. +// +// r0 := srl(a0, count) +// r1 := srl(a1, count) +// +// https://msdn.microsoft.com/en-us/library/yf6cf9k8(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count) +{ + uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); + if (unlikely(c > 63)) + return _mm_setzero_si128(); + + int64x2_t vc = vdupq_n_s64(-(int64_t) c); + return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc)); +} + +// NEON does not provide a version of this function. +// Creates a 16-bit mask from the most significant bits of the 16 signed or +// unsigned 8-bit integers in a and zero extends the upper bits. +// https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx +FORCE_INLINE int _mm_movemask_epi8(__m128i a) +{ + // Use increasingly wide shifts+adds to collect the sign bits + // together. + // Since the widening shifts would be rather confusing to follow in little + // endian, everything will be illustrated in big endian order instead. This + // has a different result - the bits would actually be reversed on a big + // endian machine. + + // Starting input (only half the elements are shown): + // 89 ff 1d c0 00 10 99 33 + uint8x16_t input = vreinterpretq_u8_m128i(a); + + // Shift out everything but the sign bits with an unsigned shift right. + // + // Bytes of the vector:: + // 89 ff 1d c0 00 10 99 33 + // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7) + // | | | | | | | | + // 01 01 00 01 00 00 01 00 + // + // Bits of first important lane(s): + // 10001001 (89) + // \______ + // | + // 00000001 (01) + uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7)); + + // Merge the even lanes together with a 16-bit unsigned shift right + add. + // 'xx' represents garbage data which will be ignored in the final result. + // In the important bytes, the add functions like a binary OR. 
+    //
+    // 01 01 00 01 00 00 01 00
+    //  \_ |  \_ |  \_ |  \_ |   paired16 = (uint32x4_t)(input + (input >> 7))
+    //    \|    \|    \|    \|
+    // xx 03 xx 01 xx 00 xx 02
+    //
+    // 00000001 00000001 (01 01)
+    //        \_______ |
+    //                \|
+    // xxxxxxxx xxxxxx11 (xx 03)
+    uint32x4_t paired16 =
+        vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
+
+    // Repeat with a wider 32-bit shift + add.
+    // xx 03 xx 01 xx 00 xx 02
+    //     \____ |     \____ |   paired32 = (uint64x1_t)(paired16 +
+    //          \|          \|                           (paired16 >> 14))
+    // xx xx xx 0d xx xx xx 02
+    //
+    // 00000011 00000001 (03 01)
+    //        \\_____ ||
+    //         '----.\||
+    // xxxxxxxx xxxx1101 (xx 0d)
+    uint64x2_t paired32 =
+        vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
+
+    // Last, an even wider 64-bit shift + add to get our result in the low
+    // 8-bit lanes.
+    // xx xx xx 0d xx xx xx 02
+    //            \_________ |   paired64 = (uint8x8_t)(paired32 +
+    //                      \|                          (paired32 >> 28))
+    // xx xx xx xx xx xx xx d2
+    //
+    // 00001101 00000010 (0d 02)
+    //     \   \___ |  |
+    //      '---.  \|  |
+    // xxxxxxxx 11010010 (xx d2)
+    uint8x16_t paired64 =
+        vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
+
+    // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
+    // xx xx xx xx xx xx xx d2
+    //                      ||   return paired64[0]
+    //                      d2
+    // Note: Little endian would return the correct value 4b (01001011)
+    // instead.
+    return vgetq_lane_u8(paired64, 0) |
+           ((int) vgetq_lane_u8(paired64, 8) << 8);
+}
+
+// Set each bit of mask dst based on the most significant bit of the
+// corresponding packed double-precision (64-bit) floating-point element in a.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movemask_pd
+FORCE_INLINE int _mm_movemask_pd(__m128d a)
+{
+    uint64x2_t input = vreinterpretq_u64_m128d(a);
+    uint64x2_t high_bits = vshrq_n_u64(input, 63);
+    return vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1);
+}
+
+// Copy the lower 64-bit integer in a to dst.
+//
+// dst[63:0] := a[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64
+FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
+{
+    return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
+}
+
+// Copy the 64-bit integer a to the lower element of dst, and zero the upper
+// element.
+//
+// dst[63:0] := a[63:0]
+// dst[127:64] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64
+FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
+{
+    return vreinterpretq_m128i_s64(
+        vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
+}
+
+// NEON does not provide this method
+// Creates a 4-bit mask from the most significant bits of the four
+// single-precision, floating-point values.
+// https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx
+FORCE_INLINE int _mm_movemask_ps(__m128 a)
+{
+    uint32x4_t input = vreinterpretq_u32_m128(a);
+#if defined(__aarch64__)
+    static const int32x4_t shift = {0, 1, 2, 3};
+    uint32x4_t tmp = vshrq_n_u32(input, 31);
+    return vaddvq_u32(vshlq_u32(tmp, shift));
+#else
+    // Uses the exact same method as _mm_movemask_epi8, see that for details.
+    // Shift out everything but the sign bits with a 32-bit unsigned shift
+    // right.
+    uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
+    // Merge the two pairs together with a 64-bit unsigned shift right + add.
+    uint8x16_t paired =
+        vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
+    // Extract the result.
+ return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2); +#endif +} + +// Compute the bitwise NOT of a and then AND with a 128-bit vector containing +// all 1's, and return 1 if the result is zero, otherwise return 0. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones +FORCE_INLINE int _mm_test_all_ones(__m128i a) +{ + return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) == + ~(uint64_t) 0; +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and +// mask, and return 1 if the result is zero, otherwise return 0. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros +FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask) +{ + int64x2_t a_and_mask = + vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask)); + return (vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)) ? 0 + : 1; +} + +/* Math operations */ + +// Subtracts the four single-precision, floating-point values of a and b. +// +// r0 := a0 - b0 +// r1 := a1 - b1 +// r2 := a2 - b2 +// r3 := a3 - b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx +FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Subtract the lower single-precision (32-bit) floating-point element in b from +// the lower single-precision (32-bit) floating-point element in a, store the +// result in the lower element of dst, and copy the upper 3 packed elements from +// a to the upper elements of dst. +// +// dst[31:0] := a[31:0] - b[31:0] +// dst[127:32] := a[127:32] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss +FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_sub_ps(a, b)); +} + +// Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a, +// and store the results in dst. +// r0 := a0 - b0 +// r1 := a1 - b1 +FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s64( + vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +} + +// Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or +// unsigned 32-bit integers of a. +// +// r0 := a0 - b0 +// r1 := a1 - b1 +// r2 := a2 - b2 +// r3 := a3 - b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx +FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and +// store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi16 +FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and +// store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi8 +FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst. 
+//
+// dst[63:0] := a[63:0] - b[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64
+FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s64(
+        vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
+}
+
+// Subtracts the 8 unsigned 16-bit integers of b from the 8 unsigned 16-bit
+// integers of a and saturates.
+// https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit
+// integers of a and saturates.
+//
+// r0 := UnsignedSaturate(a0 - b0)
+// r1 := UnsignedSaturate(a1 - b1)
+// ...
+// r15 := UnsignedSaturate(a15 - b15)
+//
+// https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90)
+FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+#define _mm_ucomieq_sd _mm_comieq_sd
+#define _mm_ucomige_sd _mm_comige_sd
+#define _mm_ucomigt_sd _mm_comigt_sd
+#define _mm_ucomile_sd _mm_comile_sd
+#define _mm_ucomilt_sd _mm_comilt_sd
+#define _mm_ucomineq_sd _mm_comineq_sd
+
+// Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit
+// integers of a and saturates.
+//
+// r0 := SignedSaturate(a0 - b0)
+// r1 := SignedSaturate(a1 - b1)
+// ...
+// r15 := SignedSaturate(a15 - b15)
+//
+// https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90)
+FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit
+// integers of a and saturates.
+//
+// r0 := SignedSaturate(a0 - b0)
+// r1 := SignedSaturate(a1 - b1)
+// ...
+// r7 := SignedSaturate(a7 - b7)
+//
+// https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90)
+FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Subtract packed double-precision (64-bit) floating-point elements in b from
+// packed double-precision (64-bit) floating-point elements in a, and store
+// the results in dst.
+//
+// FOR j := 0 to 1
+// i := j*64
+// dst[i+63:i] := a[i+63:i] - b[i+63:i]
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_pd
+FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] - db[0];
+    c[1] = da[1] - db[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
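+
+// Usage sketch (illustrative addition, not part of the x86 API): unsigned
+// saturation clamps at zero instead of wrapping, so ORing the two one-sided
+// saturated differences yields the absolute difference per lane. The helper
+// name is hypothetical, and _mm_or_si128 is assumed to be provided elsewhere
+// in this header, as it is in upstream sse2neon.
+FORCE_INLINE __m128i sse2neon_example_absdiff_epu8(__m128i a, __m128i b)
+{
+    // max(a - b, 0) | max(b - a, 0) == |a - b| for unsigned 8-bit lanes.
+    return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
+}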
+
+// Subtract the lower double-precision (64-bit) floating-point element in b
+// from the lower double-precision (64-bit) floating-point element in a, store
+// the result in the lower element of dst, and copy the upper element from a
+// to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sd
+FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_sub_pd(a, b));
+}
+
+// Add packed unsigned 16-bit integers in a and b using saturation, and store
+// the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu16
+FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Negate packed 8-bit integers in a when the corresponding signed
+// 8-bit integer in b is negative, and store the results in dst.
+// Elements in dst are zeroed out when the corresponding element
+// in b is zero.
+//
+// for i in 0..15
+// if b[i] < 0
+// r[i] := -a[i]
+// else if b[i] == 0
+// r[i] := 0
+// else
+// r[i] := a[i]
+// fi
+// done
+FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
+{
+    int8x16_t a = vreinterpretq_s8_m128i(_a);
+    int8x16_t b = vreinterpretq_s8_m128i(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFF : 0
+    uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
+
+    // (b == 0) ? 0xFF : 0
+#if defined(__aarch64__)
+    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
+#else
+    int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vnegq_s8(a) returns negative
+    // 'a') based on ltMask
+    int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
+    // res = masked & (~zeroMask)
+    int8x16_t res = vbicq_s8(masked, zeroMask);
+
+    return vreinterpretq_m128i_s8(res);
+}
+
+// Negate packed 16-bit integers in a when the corresponding signed
+// 16-bit integer in b is negative, and store the results in dst.
+// Elements in dst are zeroed out when the corresponding element
+// in b is zero.
+//
+// for i in 0..7
+// if b[i] < 0
+// r[i] := -a[i]
+// else if b[i] == 0
+// r[i] := 0
+// else
+// r[i] := a[i]
+// fi
+// done
+FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
+{
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFF : 0
+    uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
+    // (b == 0) ? 0xFFFF : 0
+#if defined(__aarch64__)
+    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
+#else
+    int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vnegq_s16(a) equals to
+    // negative 'a') based on ltMask
+    int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
+    // res = masked & (~zeroMask)
+    int16x8_t res = vbicq_s16(masked, zeroMask);
+    return vreinterpretq_m128i_s16(res);
+}
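+
+// Usage sketch (illustrative addition, not part of the x86 API): passing the
+// same register for both operands of _mm_sign_epi16 yields the per-lane
+// absolute value, with the usual caveat that INT16_MIN negates to itself.
+// The helper name is hypothetical.
+FORCE_INLINE __m128i sse2neon_example_abs_epi16(__m128i b)
+{
+    // sign(b, b): negative lanes are negated, zero lanes stay zero.
+    return _mm_sign_epi16(b, b);
+}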
+
+// Negate packed 32-bit integers in a when the corresponding signed
+// 32-bit integer in b is negative, and store the results in dst.
+// Elements in dst are zeroed out when the corresponding element
+// in b is zero.
+//
+// for i in 0..3
+// if b[i] < 0
+// r[i] := -a[i]
+// else if b[i] == 0
+// r[i] := 0
+// else
+// r[i] := a[i]
+// fi
+// done
+FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
+{
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFFFFFF : 0
+    uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
+
+    // (b == 0) ? 0xFFFFFFFF : 0
+#if defined(__aarch64__)
+    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
+#else
+    int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vnegq_s32(a) equals to
+    // negative 'a') based on ltMask
+    int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
+    // res = masked & (~zeroMask)
+    int32x4_t res = vbicq_s32(masked, zeroMask);
+    return vreinterpretq_m128i_s32(res);
+}
+
+// Negate packed 16-bit integers in a when the corresponding signed 16-bit
+// integer in b is negative, and store the results in dst. Elements in dst are
+// zeroed out when the corresponding element in b is zero.
+//
+// FOR j := 0 to 3
+// i := j*16
+// IF b[i+15:i] < 0
+// dst[i+15:i] := -(a[i+15:i])
+// ELSE IF b[i+15:i] == 0
+// dst[i+15:i] := 0
+// ELSE
+// dst[i+15:i] := a[i+15:i]
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16
+FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
+{
+    int16x4_t a = vreinterpret_s16_m64(_a);
+    int16x4_t b = vreinterpret_s16_m64(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFF : 0
+    uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
+
+    // (b == 0) ? 0xFFFF : 0
+#if defined(__aarch64__)
+    int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
+#else
+    int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vneg_s16(a) returns negative
+    // 'a') based on ltMask
+    int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
+    // res = masked & (~zeroMask)
+    int16x4_t res = vbic_s16(masked, zeroMask);
+
+    return vreinterpret_m64_s16(res);
+}
+
+// Negate packed 32-bit integers in a when the corresponding signed 32-bit
+// integer in b is negative, and store the results in dst. Elements in dst are
+// zeroed out when the corresponding element in b is zero.
+//
+// FOR j := 0 to 1
+// i := j*32
+// IF b[i+31:i] < 0
+// dst[i+31:i] := -(a[i+31:i])
+// ELSE IF b[i+31:i] == 0
+// dst[i+31:i] := 0
+// ELSE
+// dst[i+31:i] := a[i+31:i]
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32
+FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
+{
+    int32x2_t a = vreinterpret_s32_m64(_a);
+    int32x2_t b = vreinterpret_s32_m64(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFFFFFFFF : 0
+    uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
+
+    // (b == 0) ? 0xFFFFFFFF : 0
+#if defined(__aarch64__)
+    int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
+#else
+    int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vneg_s32(a) returns negative
+    // 'a') based on ltMask
+    int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
+    // res = masked & (~zeroMask)
+    int32x2_t res = vbic_s32(masked, zeroMask);
+
+    return vreinterpret_m64_s32(res);
+}
+
+// Negate packed 8-bit integers in a when the corresponding signed 8-bit
+// integer in b is negative, and store the results in dst. Elements in dst are
+// zeroed out when the corresponding element in b is zero.
+//
+// FOR j := 0 to 7
+// i := j*8
+// IF b[i+7:i] < 0
+// dst[i+7:i] := -(a[i+7:i])
+// ELSE IF b[i+7:i] == 0
+// dst[i+7:i] := 0
+// ELSE
+// dst[i+7:i] := a[i+7:i]
+// FI
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8
+FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
+{
+    int8x8_t a = vreinterpret_s8_m64(_a);
+    int8x8_t b = vreinterpret_s8_m64(_b);
+
+    // signed shift right: faster than vclt
+    // (b < 0) ? 0xFF : 0
+    uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
+
+    // (b == 0) ? 0xFF : 0
+#if defined(__aarch64__)
+    int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
+#else
+    int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
+#endif
+
+    // bitwise select either a or negative 'a' (vneg_s8(a) returns negative
+    // 'a') based on ltMask
+    int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
+    // res = masked & (~zeroMask)
+    int8x8_t res = vbic_s8(masked, zeroMask);
+
+    return vreinterpret_m64_s8(res);
+}
+
+// Average packed unsigned 16-bit integers in a and b, and store the results
+// in dst.
+//
+// FOR j := 0 to 3
+// i := j*16
+// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16
+FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u16(
+        vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
+}
+
+// Average packed unsigned 8-bit integers in a and b, and store the results in
+// dst.
+//
+// FOR j := 0 to 7
+// i := j*8
+// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8
+FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u8(
+        vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+}
+
+// Average packed unsigned 8-bit integers in a and b, and store the results in
+// dst.
+//
+// FOR j := 0 to 7
+// i := j*8
+// dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb
+#define _m_pavgb(a, b) _mm_avg_pu8(a, b)
+
+// Average packed unsigned 16-bit integers in a and b, and store the results
+// in dst.
+//
+// FOR j := 0 to 3
+// i := j*16
+// dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw
+#define _m_pavgw(a, b) _mm_avg_pu16(a, b)
+
+// Extract a 16-bit integer from a, selected with imm8, and store the result
+// in the lower element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pextrw
+#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)
+
+// Copy a to dst, and insert the 16-bit integer i into dst at the location
+// specified by imm8.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_pinsrw
+#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)
+
+// Compare packed signed 16-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxsw
+#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxub
+#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
+
+// Compare packed signed 16-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminsw +#define _m_pminsw(a, b) _mm_min_pi16(a, b) + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminub +#define _m_pminub(a, b) _mm_min_pu8(a, b) + +// Create mask from the most significant bit of each 8-bit element in a, and +// store the result in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmovmskb +#define _m_pmovmskb(a) _mm_movemask_pi8(a) + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw +#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b) + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce four +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_psadbw +#define _m_psadbw(a, b) _mm_sad_pu8(a, b) + +// Computes the average of the 16 unsigned 8-bit integers in a and the 16 +// unsigned 8-bit integers in b and rounds. +// +// r0 := (a0 + b0) / 2 +// r1 := (a1 + b1) / 2 +// ... +// r15 := (a15 + b15) / 2 +// +// https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx +FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Shift a left by imm8 bytes while shifting in zeros, and store the results in +// dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bslli_si128 +#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm) + +// Shift a right by imm8 bytes while shifting in zeros, and store the results in +// dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_bsrli_si128 +#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm) + +// Computes the average of the 8 unsigned 16-bit integers in a and the 8 +// unsigned 16-bit integers in b and rounds. +// +// r0 := (a0 + b0) / 2 +// r1 := (a1 + b1) / 2 +// ... +// r7 := (a7 + b7) / 2 +// +// https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx +FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b) +{ + return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a), + vreinterpretq_u16_m128i(b)); +} + +// Adds the four single-precision, floating-point values of a and b. +// +// r0 := a0 + b0 +// r1 := a1 + b1 +// r2 := a2 + b2 +// r3 := a3 + b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx +FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Add packed double-precision (64-bit) floating-point elements in a and b, and +// store the results in dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd
+FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] + db[0];
+    c[1] = da[1] + db[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Add the lower double-precision (64-bit) floating-point element in a and b,
+// store the result in the lower element of dst, and copy the upper element
+// from a to the upper element of dst.
+//
+// dst[63:0] := a[63:0] + b[63:0]
+// dst[127:64] := a[127:64]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sd
+FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_add_pd(a, b));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] + db[0];
+    c[1] = da[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Add 64-bit integers a and b, and store the result in dst.
+//
+// dst[63:0] := a[63:0] + b[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64
+FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s64(
+        vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
+}
+
+// Adds the scalar single-precision floating point values of a and b.
+// https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
+{
+    float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
+    float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
+    // the upper values in the result must be the remnants of a.
+    return vreinterpretq_m128_f32(vaddq_f32(a, value));
+}
+
+// Adds the 2 signed or unsigned 64-bit integers in a to the 2 signed or
+// unsigned 64-bit integers in b.
+// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s64(
+        vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
+}
+
+// Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or
+// unsigned 32-bit integers in b.
+//
+// r0 := a0 + b0
+// r1 := a1 + b1
+// r2 := a2 + b2
+// r3 := a3 + b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or
+// unsigned 16-bit integers in b.
+// https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or
+// unsigned 8-bit integers in b.
+// https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90)
+FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b
+// and saturates.
+//
+// r0 := SignedSaturate(a0 + b0)
+// r1 := SignedSaturate(a1 + b1)
+// ...
+// r7 := SignedSaturate(a7 + b7) +// +// https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx +FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Add packed signed 8-bit integers in a and b using saturation, and store the +// results in dst. +// +// FOR j := 0 to 15 +// i := j*8 +// dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8 +FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s8( + vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in +// b and saturates.. +// https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx +FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or +// unsigned 16-bit integers from b. +// +// r0 := (a0 * b0)[15:0] +// r1 := (a1 * b1)[15:0] +// ... +// r7 := (a7 * b7)[15:0] +// +// https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx +FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s16( + vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or +// unsigned 32-bit integers from b. +// https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx +FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_s32( + vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Multiply the packed unsigned 16-bit integers in a and b, producing +// intermediate 32-bit integers, and store the high 16 bits of the intermediate +// integers in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// tmp[31:0] := a[i+15:i] * b[i+15:i] +// dst[i+15:i] := tmp[31:16] +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw +#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b) + +// Multiplies the four single-precision, floating-point values of a and b. +// +// r0 := a0 * b0 +// r1 := a1 * b1 +// r2 := a2 * b2 +// r3 := a3 * b3 +// +// https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx +FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_f32( + vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Multiply packed double-precision (64-bit) floating-point elements in a and b, +// and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd +FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[2]; + c[0] = da[0] * db[0]; + c[1] = da[1] * db[1]; + return vld1q_f32((float32_t *) c); +#endif +} + +// Multiply the lower double-precision (64-bit) floating-point element in a and +// b, store the result in the lower element of dst, and copy the upper element +// from a to the upper element of dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_sd
+FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
+{
+    return _mm_move_sd(a, _mm_mul_pd(a, b));
+}
+
+// Multiply the lower single-precision (32-bit) floating-point element in a
+// and b, store the result in the lower element of dst, and copy the upper 3
+// packed elements from a to the upper elements of dst.
+//
+// dst[31:0] := a[31:0] * b[31:0]
+// dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss
+FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_mul_ps(a, b));
+}
+
+// Multiply the low unsigned 32-bit integers from each packed 64-bit element
+// in a and b, and store the unsigned 64-bit results in dst.
+//
+// r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF)
+// r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF)
+FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
+{
+    // vmull_u32 upcasts instead of masking, so we downcast.
+    uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
+    uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
+    return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
+}
+
+// Multiply the low unsigned 32-bit integers from a and b, and store the
+// unsigned 64-bit result in dst.
+//
+// dst[63:0] := a[31:0] * b[31:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32
+FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u64(vget_low_u64(
+        vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
+}
+
+// Multiply the low signed 32-bit integers from each packed 64-bit element in
+// a and b, and store the signed 64-bit results in dst.
+//
+// r0 := (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0
+// r1 := (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2
+FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
+{
+    // vmull_s32 upcasts instead of masking, so we downcast.
+    int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
+    int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
+    return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
+}
+
+// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
+// integers from b, then horizontally adds adjacent pairs of the intermediate
+// 32-bit products.
+//
+// r0 := (a0 * b0) + (a1 * b1)
+// r1 := (a2 * b2) + (a3 * b3)
+// r2 := (a4 * b4) + (a5 * b5)
+// r3 := (a6 * b6) + (a7 * b7)
+// https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
+{
+    int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
+                              vget_low_s16(vreinterpretq_s16_m128i(b)));
+    int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
+                               vget_high_s16(vreinterpretq_s16_m128i(b)));
+
+    int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
+    int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
+
+    return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
+}
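+
+// Usage sketch (illustrative addition, not part of the x86 API):
+// _mm_madd_epi16 is the usual building block for 16-bit dot products;
+// summing its four 32-bit outputs gives the dot product of all eight lane
+// pairs. The helper name is hypothetical, and _mm_cvtsi128_si32 is assumed
+// to be provided elsewhere in this header, as it is in upstream sse2neon.
+FORCE_INLINE int32_t sse2neon_example_dot8_epi16(__m128i a, __m128i b)
+{
+    __m128i prod = _mm_madd_epi16(a, b);  // four pairwise sums of products
+    // Horizontal add: fold the high 64 bits onto the low 64 bits, then fold
+    // lane 1 onto lane 0.
+    __m128i sum =
+        _mm_add_epi32(prod, _mm_shuffle_epi32(prod, _MM_SHUFFLE(1, 0, 3, 2)));
+    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, _MM_SHUFFLE(2, 3, 0, 1)));
+    return _mm_cvtsi128_si32(sum);
+}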
+
+// Conditionally store 8-bit integer elements from a into memory using mask
+// (elements are not stored when the highest bit is not set in the
+// corresponding element) and a non-temporal memory hint. mem_addr does not
+// need to be aligned on any particular boundary.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmoveu_si128
+FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
+{
+    int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
+    // Use an unaligned NEON load; mem_addr carries no alignment guarantee.
+    int8x16_t b = vld1q_s8((const int8_t *) mem_addr);
+    int8x16_t masked =
+        vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a), b);
+    vst1q_s8((int8_t *) mem_addr, masked);
+}
+
+// Multiply packed signed 16-bit integers in a and b, producing intermediate
+// signed 32-bit integers. Shift right by 15 bits while rounding up, and store
+// the packed 16-bit integers in dst.
+//
+// r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15)
+// r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15)
+// r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15)
+// ...
+// r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15)
+FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
+{
+    // Has issues due to saturation
+    // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));
+
+    // Multiply
+    int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
+                                 vget_low_s16(vreinterpretq_s16_m128i(b)));
+    int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
+                                 vget_high_s16(vreinterpretq_s16_m128i(b)));
+
+    // Rounding narrowing shift right
+    // narrow = (int16_t)((mul + 16384) >> 15);
+    int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
+    int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);
+
+    // Join together
+    return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
+}
+
+// Vertically multiply each unsigned 8-bit integer from a with the
+// corresponding signed 8-bit integer from b, producing intermediate signed
+// 16-bit integers. Horizontally add adjacent pairs of intermediate signed
+// 16-bit integers, and pack the saturated results in dst.
+//
+// FOR j := 0 to 7
+// i := j*16
+// dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] +
+// a[i+7:i]*b[i+7:i] )
+// ENDFOR
+FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
+{
+#if defined(__aarch64__)
+    uint8x16_t a = vreinterpretq_u8_m128i(_a);
+    int8x16_t b = vreinterpretq_s8_m128i(_b);
+    int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
+                             vmovl_s8(vget_low_s8(b)));
+    int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
+                             vmovl_s8(vget_high_s8(b)));
+    return vreinterpretq_m128i_s16(
+        vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
+#else
+    // This would be much simpler if x86 would choose to zero extend OR sign
+    // extend, not both. This could probably be optimized better.
+    uint16x8_t a = vreinterpretq_u16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+
+    // Zero extend a
+    int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
+    int16x8_t a_even =
+        vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));
+
+    // Sign extend by shifting left then shifting right.
+    int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
+    int16x8_t b_odd = vshrq_n_s16(b, 8);
+
+    // multiply
+    int16x8_t prod1 = vmulq_s16(a_even, b_even);
+    int16x8_t prod2 = vmulq_s16(a_odd, b_odd);
+
+    // saturated add
+    return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
+#endif
+}
+
+// Computes the fused multiply-add product of 32-bit floating point numbers.
+//
+// Return Value
+// Multiplies A and B, and adds C to the temporary result before returning it.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd +FORCE_INLINE __m128 _mm_fmadd_ps(__m128 a, __m128 b, __m128 c) +{ +#if defined(__aarch64__) + return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(c), + vreinterpretq_f32_m128(b), + vreinterpretq_f32_m128(a))); +#else + return _mm_add_ps(_mm_mul_ps(a, b), c); +#endif +} + +// Alternatively add and subtract packed double-precision (64-bit) +// floating-point elements in a to/from packed elements in b, and store the +// results in dst. +// +// FOR j := 0 to 1 +// i := j*64 +// IF ((j & 1) == 0) +// dst[i+63:i] := a[i+63:i] - b[i+63:i] +// ELSE +// dst[i+63:i] := a[i+63:i] + b[i+63:i] +// FI +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_addsub_pd +FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b) +{ + __m128d mask = _mm_set_pd(1.0f, -1.0f); +#if defined(__aarch64__) + return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a), + vreinterpretq_f64_m128d(b), + vreinterpretq_f64_m128d(mask))); +#else + return _mm_add_pd(_mm_mul_pd(b, mask), a); +#endif +} + +// Alternatively add and subtract packed single-precision (32-bit) +// floating-point elements in a to/from packed elements in b, and store the +// results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps +FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b) +{ + __m128 mask = {-1.0f, 1.0f, -1.0f, 1.0f}; + return _mm_fmadd_ps(b, mask, a); +} + +// Horizontally add adjacent pairs of double-precision (64-bit) floating-point +// elements in a and b, and pack the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pd +FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + double *da = (double *) &a; + double *db = (double *) &b; + double c[] = {da[0] + da[1], db[0] + db[1]}; + return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c)); +#endif +} + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce two +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of 64-bit elements in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8 +FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b) +{ + uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b)); + uint16_t r0 = t[0] + t[1] + t[2] + t[3]; + uint16_t r4 = t[4] + t[5] + t[6] + t[7]; + uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0); + return (__m128i) vsetq_lane_u16(r4, r, 4); +} + +// Compute the absolute differences of packed unsigned 8-bit integers in a and +// b, then horizontally sum each consecutive 8 differences to produce four +// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low +// 16 bits of dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8
+FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
+{
+    uint16x4_t t =
+        vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+    uint16_t r0 = t[0] + t[1] + t[2] + t[3];
+    return vreinterpret_m64_u16(vset_lane_u16(r0, vdup_n_u16(0), 0));
+}
+
+// Compute the absolute differences of packed unsigned 8-bit integers in a and
+// b, then horizontally sum each consecutive 8 differences to produce four
+// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
+// 16 bits of dst.
+//
+// FOR j := 0 to 7
+//     i := j*8
+//     tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i])
+// ENDFOR
+// dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] +
+//              tmp[47:40] + tmp[55:48] + tmp[63:56]
+// dst[63:16] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_psadbw
+#define _m_psadbw(a, b) _mm_sad_pu8(a, b)
+
+// Divides the four single-precision, floating-point values of a and b.
+//
+// r0 := a0 / b0
+// r1 := a1 / b1
+// r2 := a2 / b2
+// r3 := a3 / b3
+//
+// https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV
+    return vreinterpretq_m128_f32(
+        vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
+#if SSE2NEON_PRECISE_DIV
+    // Additional Newton-Raphson iteration for accuracy
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
+#endif
+    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
+#endif
+}
+
+// Divides the scalar single-precision floating point value of a by b.
+// https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
+{
+    float32_t value =
+        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
+}
+
+// Divide packed double-precision (64-bit) floating-point elements in a by
+// packed elements in b, and store the results in dst.
+//
+// FOR j := 0 to 1
+//     i := 64*j
+//     dst[i+63:i] := a[i+63:i] / b[i+63:i]
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_pd
+FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2];
+    c[0] = da[0] / db[0];
+    c[1] = da[1] / db[1];
+    return vld1q_f32((float32_t *) c);
+#endif
+}
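+
+// Illustrative note (added commentary, not part of upstream sse2neon): the
+// vrecpsq_f32 step above is one Newton-Raphson refinement of the reciprocal
+// estimate. Given an estimate x of 1/d, vrecpsq_f32(x, d) returns (2 - d*x),
+// so
+//
+//     x' = x * (2 - d*x)
+//
+// roughly doubles the number of correct bits per iteration. The scalar
+// equivalent of one refinement step is:
+//
+//     float refine_recip(float x, float d) { return x * (2.0f - d * x); }
+
+// Divide the lower double-precision (64-bit) floating-point element in a by the
+// lower double-precision (64-bit) floating-point element in b, store the result
+// in the lower element of dst, and copy the upper element from a to the upper
+// element of dst.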
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sd
+FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    float64x2_t tmp =
+        vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
+    return vreinterpretq_m128d_f64(
+        vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
+#else
+    return _mm_move_sd(a, _mm_div_pd(a, b));
+#endif
+}
+
+// Compute the approximate reciprocal of packed single-precision (32-bit)
+// floating-point elements in a, and store the results in dst. The maximum
+// relative error for this approximation is less than 1.5*2^-12.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps
+FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
+{
+    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
+#if SSE2NEON_PRECISE_DIV
+    // Additional Newton-Raphson iteration for accuracy
+    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
+#endif
+    return vreinterpretq_m128_f32(recip);
+}
+
+// Compute the approximate reciprocal of the lower single-precision (32-bit)
+// floating-point element in a, store the result in the lower element of dst,
+// and copy the upper 3 packed elements from a to the upper elements of dst. The
+// maximum relative error for this approximation is less than 1.5*2^-12.
+//
+// dst[31:0] := (1.0 / a[31:0])
+// dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss
+FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
+{
+    return _mm_move_ss(a, _mm_rcp_ps(a));
+}
+
+// Computes the approximations of square roots of the four single-precision,
+// floating-point values of a. First computes reciprocal square roots and then
+// reciprocals of the four values.
+//
+// r0 := sqrt(a0)
+// r1 := sqrt(a1)
+// r2 := sqrt(a2)
+// r3 := sqrt(a3)
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
+{
+#if SSE2NEON_PRECISE_SQRT
+    float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+
+    // Test for vrsqrteq_f32(0) -> positive infinity case.
+    // Change to zero, so that s * 1/sqrt(s) result is zero too.
+    const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
+    const uint32x4_t div_by_zero =
+        vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
+    recip = vreinterpretq_f32_u32(
+        vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
+
+    // Additional Newton-Raphson iteration for accuracy
+    recip = vmulq_f32(
+        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
+        recip);
+    recip = vmulq_f32(
+        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
+        recip);
+
+    // sqrt(s) = s * 1/sqrt(s)
+    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
+#elif defined(__aarch64__)
+    return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
+#else
+    float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+    float32x4_t sq = vrecpeq_f32(recipsq);
+    return vreinterpretq_m128_f32(sq);
+#endif
+}
+
+// Computes the approximation of the square root of the scalar single-precision
+// floating point value of in.
+// https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
+{
+    float32_t value =
+        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
+}
+
+// Computes the approximations of the reciprocal square roots of the four
+// single-precision floating point values of in.
+// The current precision is 1% error.
+// https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
+{
+    float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));
+#if SSE2NEON_PRECISE_SQRT
+    // Additional Newton-Raphson iteration for accuracy
+    out = vmulq_f32(
+        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
+    out = vmulq_f32(
+        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));
+#endif
+    return vreinterpretq_m128_f32(out);
+}
+
+// Compute the approximate reciprocal square root of the lower single-precision
+// (32-bit) floating-point element in a, store the result in the lower element
+// of dst, and copy the upper 3 packed elements from a to the upper elements of
+// dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss
+FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
+{
+    return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
+}
+
+// Compare packed signed 16-bit integers in a and b, and store packed maximum
+// values in dst.
+//
+// FOR j := 0 to 3
+//     i := j*16
+//     dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
+FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s16(
+        vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
+}
+
+// Compare packed signed 16-bit integers in a and b, and store packed maximum
+// values in dst.
+//
+// FOR j := 0 to 3
+//     i := j*16
+//     dst[i+15:i] := MAX(a[i+15:i], b[i+15:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16
+#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)
+
+// Computes the maximums of the four single-precision, floating-point values of
+// a and b.
+// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
+{
+#if SSE2NEON_PRECISE_MINMAX
+    float32x4_t _a = vreinterpretq_f32_m128(a);
+    float32x4_t _b = vreinterpretq_f32_m128(b);
+    return vbslq_f32(vcltq_f32(_b, _a), _a, _b);
+#else
+    return vreinterpretq_m128_f32(
+        vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#endif
+}
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
+// values in dst.
+//
+// FOR j := 0 to 7
+//     i := j*8
+//     dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
+FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u8(
+        vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
+}
+
+// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
+// values in dst.
+//
+// FOR j := 0 to 7
+//     i := j*8
+//     dst[i+7:i] := MAX(a[i+7:i], b[i+7:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8
+#define _m_pmaxub(a, b) _mm_max_pu8(a, b)
+
+// Compare packed signed 16-bit integers in a and b, and store packed minimum
+// values in dst.
+// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16 +FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b) +{ + return vreinterpret_m64_s16( + vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); +} + +// Compare packed signed 16-bit integers in a and b, and store packed minimum +// values in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16 +#define _m_pminsw(a, b) _mm_min_pi16(a, b) + +// Computes the minima of the four single-precision, floating-point values of a +// and b. +// https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx +FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b) +{ +#if SSE2NEON_PRECISE_MINMAX + float32x4_t _a = vreinterpretq_f32_m128(a); + float32x4_t _b = vreinterpretq_f32_m128(b); + return vbslq_f32(vcltq_f32(_a, _b), _a, _b); +#else + return vreinterpretq_m128_f32( + vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#endif +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8 +FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b) +{ + return vreinterpret_m64_u8( + vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); +} + +// Compare packed unsigned 8-bit integers in a and b, and store packed minimum +// values in dst. +// +// FOR j := 0 to 7 +// i := j*8 +// dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8 +#define _m_pminub(a, b) _mm_min_pu8(a, b) + +// Computes the maximum of the two lower scalar single-precision floating point +// values of a and b. +// https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx +FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b) +{ + float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Computes the minimum of the two lower scalar single-precision floating point +// values of a and b. +// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx +FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b) +{ + float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0); + return vreinterpretq_m128_f32( + vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); +} + +// Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the +// 16 unsigned 8-bit integers from b. +// https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx +FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b, +// and store packed maximum values in dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pd
+FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
+    d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b, store the maximum value in the lower element of dst, and copy the upper
+// element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_sd
+FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_max_pd(a, b));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2] = {fmax(da[0], db[0]), da[1]};
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the
+// 16 unsigned 8-bit integers from b.
+// https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b,
+// and store packed minimum values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pd
+FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
+    d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b, store the minimum value in the lower element of dst, and copy the upper
+// element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_sd
+FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_min_pd(a, b));
+#else
+    double *da = (double *) &a;
+    double *db = (double *) &b;
+    double c[2] = {fmin(da[0], db[0]), da[1]};
+    return vld1q_f32((float32_t *) c);
+#endif
+}
+
+// Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8
+// signed 16-bit integers from b.
+// https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compare packed signed 8-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8
+FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed unsigned 16-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16
+FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Compare packed signed 8-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8
+FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed unsigned 16-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16
+FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
+}
+
+// Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8
+// signed 16-bit integers from b.
+// https://msdn.microsoft.com/en-us/library/3x060h7c(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// epi versions of min/max
+// Computes the pairwise maximums of the four signed 32-bit integer values of a
+// and b.
+//
+// A 128-bit parameter that can be defined with the following equations:
+// r0 := (a0 > b0) ? a0 : b0
+// r1 := (a1 > b1) ? a1 : b1
+// r2 := (a2 > b2) ? a2 : b2
+// r3 := (a3 > b3) ? a3 : b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Computes the pairwise minima of the four signed 32-bit integer values of a
+// and b.
+//
+// A 128-bit parameter that can be defined with the following equations:
+// r0 := (a0 < b0) ? a0 : b0
+// r1 := (a1 < b1) ? a1 : b1
+// r2 := (a2 < b2) ? a2 : b2
+// r3 := (a3 < b3) ? a3 : b3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s32(
+        vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed unsigned 32-bit integers in a and b, and store packed maximum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32
+FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
+}
+
+// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
+// values in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu32
+FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
+}
+
+// Multiply the packed unsigned 16-bit integers in a and b, producing
+// intermediate 32-bit integers, and store the high 16 bits of the intermediate
+// integers in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16
+FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_u16(vshrn_n_u32(
+        vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
+}
+
+// Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit
+// integers from b.
+//
+// r0 := (a0 * b0)[31:16]
+// r1 := (a1 * b1)[31:16]
+// ...
+// r7 := (a7 * b7)[31:16]
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
+{
+    /* FIXME: issue with large values because of result saturation */
+    // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
+    // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
+    // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
+    int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
+    int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
+    int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
+    int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
+    uint16x8x2_t r =
+        vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
+    return vreinterpretq_m128i_u16(r.val[1]);
+}
+
+// Multiply the packed unsigned 16-bit integers in a and b, producing
+// intermediate 32-bit integers, and store the high 16 bits of the intermediate
+// integers in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epu16
+FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
+{
+    uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
+    uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
+    uint32x4_t ab3210 = vmull_u16(a3210, b3210);
+#if defined(__aarch64__)
+    uint32x4_t ab7654 =
+        vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
+    uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
+                              vreinterpretq_u16_u32(ab7654));
+    return vreinterpretq_m128i_u16(r);
+#else
+    uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
+    uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
+    uint32x4_t ab7654 = vmull_u16(a7654, b7654);
+    uint16x8x2_t r =
+        vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
+    return vreinterpretq_m128i_u16(r.val[1]);
+#endif
+}
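+
+// Illustrative sketch (added commentary, not part of upstream sse2neon):
+// _mm_mulhi_epi16 keeps only the high half of each 32-bit product, which is
+// how 16x16->16 fixed-point scaling is usually written. Assuming the test
+// values below:
+//
+//     __m128i a = _mm_set1_epi16(2000);
+//     __m128i b = _mm_set1_epi16(3000);
+//     __m128i h = _mm_mulhi_epi16(a, b);  // 2000*3000 = 0x005B8D80, so every
+//                                         // lane holds 0x005B (91)
+
+// Computes the pairwise addition of the single-precision, floating-point
+// values of a and b.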
+// https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx
+FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(
+        vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+#else
+    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
+    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
+    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
+    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
+#endif
+}
+
+// Computes the pairwise addition of the 16-bit signed or unsigned integer
+// values of a and b.
+FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
+{
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
+#else
+    return vreinterpretq_m128i_s16(
+        vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
+                     vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of double-precision (64-bit)
+// floating-point elements in a and b, and pack the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_pd
+FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vsubq_f64(
+        vuzp1q_f64(vreinterpretq_f64_m128d(_a), vreinterpretq_f64_m128d(_b)),
+        vuzp2q_f64(vreinterpretq_f64_m128d(_a), vreinterpretq_f64_m128d(_b))));
+#else
+    double *da = (double *) &_a;
+    double *db = (double *) &_b;
+    double c[] = {da[0] - da[1], db[0] - db[1]};
+    return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
+#endif
+}
+
+// Horizontally subtract adjacent pairs of single-precision (32-bit)
+// floating-point elements in a and b, and pack the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps
+FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128_f32(vsubq_f32(
+        vuzp1q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)),
+        vuzp2q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b))));
+#else
+    float32x4x2_t c =
+        vuzpq_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b));
+    return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
+#endif
+}
+
+// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
+// signed 16-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16
+FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s16(
+        vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
+}
+
+// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
+// signed 32-bit results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32
+FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
+{
+    return vreinterpret_m64_s32(
+        vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
+}
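+
+// Illustrative sketch (added commentary, not part of upstream sse2neon): the
+// horizontal ops reduce within each argument, then concatenate the two
+// results. Assuming the test values below:
+//
+//     __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  // a = {1, 2, 3, 4}
+//     __m128 b = _mm_set_ps(8.0f, 7.0f, 6.0f, 5.0f);  // b = {5, 6, 7, 8}
+//     __m128 h = _mm_hadd_ps(a, b);                   // h = {3, 7, 11, 15}
+
+// Computes the pairwise difference of the 16-bit signed or unsigned integer
+// values of a and b.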
+FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
+{
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+    // Interleave using vshrn/vmovn
+    // [a0|a2|a4|a6|b0|b2|b4|b6]
+    // [a1|a3|a5|a7|b1|b3|b5|b7]
+    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
+    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
+    // Subtract
+    return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357));
+}
+
+// Computes the saturated pairwise addition of the 16-bit signed integer values
+// of a and b.
+FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
+{
+#if defined(__aarch64__)
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+    return vreinterpretq_m128i_s16(
+        vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
+#else
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+    // Interleave using vshrn/vmovn
+    // [a0|a2|a4|a6|b0|b2|b4|b6]
+    // [a1|a3|a5|a7|b1|b3|b5|b7]
+    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
+    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
+    // Saturated add
+    return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
+#endif
+}
+
+// Computes the saturated pairwise difference of the 16-bit signed integer
+// values of a and b.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16
+FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
+{
+#if defined(__aarch64__)
+    int16x8_t a = vreinterpretq_s16_m128i(_a);
+    int16x8_t b = vreinterpretq_s16_m128i(_b);
+    return vreinterpretq_m128i_s16(
+        vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
+#else
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+    // Interleave using vshrn/vmovn
+    // [a0|a2|a4|a6|b0|b2|b4|b6]
+    // [a1|a3|a5|a7|b1|b3|b5|b7]
+    int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
+    int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
+    // Saturated subtract
+    return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357));
+#endif
+}
+
+// Computes the pairwise addition of the 32-bit signed or unsigned integer
+// values of a and b.
+FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
+{
+    int32x4_t a = vreinterpretq_s32_m128i(_a);
+    int32x4_t b = vreinterpretq_s32_m128i(_b);
+    return vreinterpretq_m128i_s32(
+        vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
+                     vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
+}
+
+// Computes the pairwise difference of the 32-bit signed or unsigned integer
+// values of a and b.
+FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
+{
+    int64x2_t a = vreinterpretq_s64_m128i(_a);
+    int64x2_t b = vreinterpretq_s64_m128i(_b);
+    // Interleave using vshrn/vmovn
+    // [a0|a2|b0|b2]
+    // [a1|a3|b1|b3]
+    int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b));
+    int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32));
+    // Subtract
+    return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13));
+}
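+
+// Illustrative note (added commentary, not part of upstream sse2neon):
+// compensated (Kahan) summation carries a running error term so that adding a
+// small value to a large accumulator does not silently drop the low-order
+// bits. For example, in plain float arithmetic 1.0e8f + 1.0f == 1.0e8f, while
+// the compensation term below preserves the lost 1.0f across further adds.
+
+// Kahan summation for accurate summation of floating-point numbers.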
+// http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html
+FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y)
+{
+    y -= *c;
+    float t = *sum + y;
+    *c = (t - *sum) - y;
+    *sum = t;
+}
+
+// Conditionally multiply the packed single-precision (32-bit) floating-point
+// elements in a and b using the high 4 bits of imm, sum the four products,
+// and conditionally store the sum in dst using the low 4 bits of imm.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps
+FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
+{
+#if defined(__aarch64__)
+    /* shortcuts */
+    if (imm == 0xFF) {
+        return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b)));
+    }
+    if (imm == 0x7F) {
+        float32x4_t m = _mm_mul_ps(a, b);
+        m[3] = 0;
+        return _mm_set1_ps(vaddvq_f32(m));
+    }
+#endif
+
+    float s = 0, c = 0;
+    float32x4_t f32a = vreinterpretq_f32_m128(a);
+    float32x4_t f32b = vreinterpretq_f32_m128(b);
+
+    /* To improve the accuracy of floating-point summation, Kahan algorithm
+     * is used for each operation.
+     */
+    if (imm & (1 << 4))
+        _sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]);
+    if (imm & (1 << 5))
+        _sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]);
+    if (imm & (1 << 6))
+        _sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]);
+    if (imm & (1 << 7))
+        _sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]);
+    s += c;
+
+    float32x4_t res = {
+        (imm & 0x1) ? s : 0,
+        (imm & 0x2) ? s : 0,
+        (imm & 0x4) ? s : 0,
+        (imm & 0x8) ? s : 0,
+    };
+    return vreinterpretq_m128_f32(res);
+}
+
+/* Compare operations */
+
+// Compares for less than
+// https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compares for less than
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100)
+FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmplt_ps(a, b));
+}
+
+// Compares for greater than.
+//
+// r0 := (a0 > b0) ? 0xffffffff : 0x0
+// r1 := (a1 > b1) ? 0xffffffff : 0x0
+// r2 := (a2 > b2) ? 0xffffffff : 0x0
+// r3 := (a3 > b3) ? 0xffffffff : 0x0
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compares for greater than.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
+}
+
+// Compares for greater than or equal.
+// https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
+{
+    return vreinterpretq_m128_u32(
+        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+}
+
+// Compares for greater than or equal.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpge_ps(a, b));
+}
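+
+// Illustrative sketch (added commentary, not part of upstream sse2neon): the
+// compare intrinsics return an all-ones/all-zeros mask per lane, which is
+// typically combined with the bitwise ops to build branchless selects:
+//
+//     __m128 mask = _mm_cmplt_ps(a, b);             // 0xffffffff where a < b
+//     __m128 low  = _mm_or_ps(_mm_and_ps(mask, a),  // per-lane min(a, b)
+//                             _mm_andnot_ps(mask, b));
+
+// Compares for less than or equal.
+//
+// r0 := (a0 <= b0) ? 0xffffffff : 0x0
+// r1 := (a1 <= b1) ? 0xffffffff : 0x0
+// r2 := (a2 <= b2) ? 0xffffffff : 0x0
+// r3 := (a3 <= b3) ?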
0xffffffff : 0x0 +// +// https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compares for less than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100) +FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmple_ps(a, b)); +} + +// Compares for equality. +// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32( + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +} + +// Compares for equality. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100) +FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpeq_ps(a, b)); +} + +// Compares for inequality. +// https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx +FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b) +{ + return vreinterpretq_m128_u32(vmvnq_u32( + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); +} + +// Compares for inequality. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100) +FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpneq_ps(a, b)); +} + +// Compares for not greater than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b) +{ + return _mm_cmplt_ps(a, b); +} + +// Compares for not greater than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b) +{ + return _mm_cmplt_ss(a, b); +} + +// Compares for not greater than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100) +FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b) +{ + return _mm_cmple_ps(a, b); +} + +// Compares for not greater than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100) +FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b) +{ + return _mm_cmple_ss(a, b); +} + +// Compares for not less than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b) +{ + return _mm_cmpgt_ps(a, b); +} + +// Compares for not less than or equal. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b) +{ + return _mm_cmpgt_ss(a, b); +} + +// Compares for not less than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b) +{ + return _mm_cmpge_ps(a, b); +} + +// Compares for not less than. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100) +FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b) +{ + return _mm_cmpge_ss(a, b); +} + +// Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or +// unsigned 8-bit integers in b for equality. 
+// https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx +FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for equality, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_pd +FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_u64( + vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) + uint32x4_t cmp = + vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); + uint32x4_t swapped = vrev64q_u32(cmp); + return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for equality, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_sd +FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpeq_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for greater-than-or-equal, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_pd +FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_u64( + vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for greater-than-or-equal, store the result in the lower element of dst, +// and copy the upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpge_sd +FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return _mm_move_sd(a, _mm_cmpge_pd(a, b)); +#else + // expand "_mm_cmpge_pd()" to reduce unnecessary operations + uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); + uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); + uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); + uint64_t d[2]; + d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or +// unsigned 16-bit integers in b for equality. 
+// https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+}
+
+// Compare packed 32-bit integers in a and b for equality, and store the results
+// in dst.
+FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u32(
+        vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
+// Compare packed 64-bit integers in a and b for equality, and store the results
+// in dst.
+FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_u64(
+        vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
+#else
+    // ARMv7 lacks vceqq_u64
+    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
+    uint32x4_t cmp =
+        vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
+    uint32x4_t swapped = vrev64q_u32(cmp);
+    return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
+#endif
+}
+
+// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers
+// in b for less than.
+// https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for less-than, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_pd
+FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(
+        vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for less-than, store the result in the lower element of dst, and copy the
+// upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmplt_sd
+FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_cmplt_pd(a, b));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for not-equal, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_pd +FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64( + vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))))); +#else + // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) + uint32x4_t cmp = + vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); + uint32x4_t swapped = vrev64q_u32(cmp); + return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped))); +#endif +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-equal, store the result in the lower element of dst, and copy the +// upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpneq_sd +FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b) +{ + return _mm_move_sd(a, _mm_cmpneq_pd(a, b)); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for not-greater-than-or-equal, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_pd +FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b) +{ + return _mm_cmplt_pd(a, b); +} + +// Compare the lower double-precision (64-bit) floating-point elements in a and +// b for not-greater-than-or-equal, store the result in the lower element of +// dst, and copy the upper element from a to the upper element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpnge_sd +FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b) +{ + return _mm_cmplt_sd(a, b); +} + +// Compare the lower double-precision (64-bit) floating-point element in a and b +// for equality, and return the boolean result (0 or 1). +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_comieq_sd +FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return !!vgetq_lane_u64(vceqq_f64(a, b), 0); +#else + uint32x4_t a_not_nan = + vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a)); + uint32x4_t b_not_nan = + vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b)); + uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); + uint32x4_t a_eq_b = + vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); + uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan), + vreinterpretq_u64_u32(a_eq_b)); + return !!vgetq_lane_u64(and_results, 0); +#endif +} + +// Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers +// in b for greater than. +// +// r0 := (a0 > b0) ? 0xff : 0x0 +// r1 := (a1 > b1) ? 0xff : 0x0 +// ... +// r15 := (a15 > b15) ? 0xff : 0x0 +// +// https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +} + +// Compare packed double-precision (64-bit) floating-point elements in a and b +// for greater-than, and store the results in dst. 
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_pd
+FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(
+        vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for greater-than, store the result in the lower element of dst, and copy
+// the upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpgt_sd
+FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
+#else
+    // expand "_mm_cmpgt_pd()" to reduce unnecessary operations
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare packed double-precision (64-bit) floating-point elements in a and b
+// for less-than-or-equal, and store the results in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_pd
+FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_u64(
+        vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
+#else
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);
+
+    return vreinterpretq_m128d_u64(vld1q_u64(d));
+#endif
+}
+
+// Compare the lower double-precision (64-bit) floating-point elements in a and
+// b for less-than-or-equal, store the result in the lower element of dst, and
+// copy the upper element from a to the upper element of dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmple_sd
+FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
+{
+#if defined(__aarch64__)
+    return _mm_move_sd(a, _mm_cmple_pd(a, b));
+#else
+    // expand "_mm_cmple_pd()" to reduce unnecessary operations
+    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
+    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
+    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
+    uint64_t d[2];
+    d[0] = (*(double *) &a0) <= (*(double *) &b0) ?
~UINT64_C(0) : UINT64_C(0); + d[1] = a1; + + return vreinterpretq_m128d_u64(vld1q_u64(d)); +#endif +} + +// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers +// in b for less than. +// +// r0 := (a0 < b0) ? 0xffff : 0x0 +// r1 := (a1 < b1) ? 0xffff : 0x0 +// ... +// r7 := (a7 < b7) ? 0xffff : 0x0 +// +// https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + +// Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers +// in b for greater than. +// +// r0 := (a0 > b0) ? 0xffff : 0x0 +// r1 := (a1 > b1) ? 0xffff : 0x0 +// ... +// r7 := (a7 > b7) ? 0xffff : 0x0 +// +// https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u16( + vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +} + + +// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers +// in b for less than. +// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers +// in b for greater than. +// https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32( + vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + +// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers +// in b for greater than. +FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_u64( + vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); +#else + return vreinterpretq_m128i_s64(vshrq_n_s64( + vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)), + 63)); +#endif +} + +// Compares the four 32-bit floats in a and b to check if any values are NaN. +// Ordered compare between each value returns true for "orderable" and false for +// "not orderable" (NaN). +// https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see +// also: +// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean +// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics +FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b) +{ + // Note: NEON does not have ordered compare builtin + // Need to compare a eq a and b eq b to check for NaN + // Do AND of results to get final + uint32x4_t ceqaa = + vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); + uint32x4_t ceqbb = + vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); + return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb)); +} + +// Compares for ordered. +// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100) +FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b) +{ + return _mm_move_ss(a, _mm_cmpord_ps(a, b)); +} + +// Compares for unordered. 
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
+{
+    uint32x4_t a_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t b_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)));
+}
+
+// Compares for unordered.
+// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100)
+FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using a less than operation.
+// https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important
+// note!! The documentation on MSDN is incorrect! If either of the values is a
+// NAN the docs say you will get a one, but in fact, it will return a zero!!
+FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
+{
+    uint32x4_t a_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t b_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+    uint32x4_t a_lt_b =
+        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_lt_b), 0) != 0) ? 1 : 0;
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using a greater than operation.
+// https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx
+FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
+{
+    // return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a),
+    // vreinterpretq_f32_m128(b)), 0);
+    uint32x4_t a_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t b_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+    uint32x4_t a_gt_b =
+        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1 : 0;
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using a less than or equal operation.
+// https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx
+FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
+{
+    // return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a),
+    // vreinterpretq_f32_m128(b)), 0);
+    uint32x4_t a_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t b_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+    uint32x4_t a_le_b =
+        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_le_b), 0) != 0) ? 1 : 0;
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using a greater than or equal operation.
+// https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx
+FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
+{
+    // return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a),
+    // vreinterpretq_f32_m128(b)), 0);
+    uint32x4_t a_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t b_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+    uint32x4_t a_ge_b =
+        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1 : 0;
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using an equality operation.
+// https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx
+FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
+{
+    // return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
+    // vreinterpretq_f32_m128(b)), 0);
+    uint32x4_t a_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t b_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
+    uint32x4_t a_eq_b =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
+    return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_eq_b), 0) != 0) ? 1 : 0;
+}
+
+// Compares the lower single-precision floating point scalar values of a and b
+// using an inequality operation.
+// https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx
+FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
+{
+    // return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a),
+    // vreinterpretq_f32_m128(b)), 0);
+    uint32x4_t a_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
+    uint32x4_t b_not_nan =
+        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
+    uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
+    uint32x4_t a_neq_b = vmvnq_u32(
+        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
+    return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_neq_b), 0) != 0) ? 1 : 0;
+}
+
+// According to the documentation, these intrinsics behave the same as the
+// non-'u' versions. We'll just alias them here.
+#define _mm_ucomieq_ss _mm_comieq_ss
+#define _mm_ucomige_ss _mm_comige_ss
+#define _mm_ucomigt_ss _mm_comigt_ss
+#define _mm_ucomile_ss _mm_comile_ss
+#define _mm_ucomilt_ss _mm_comilt_ss
+#define _mm_ucomineq_ss _mm_comineq_ss
+
+/* Conversions */
+
+// Convert packed signed 32-bit integers in b to packed single-precision
+// (32-bit) floating-point elements, store the results in the lower 2 elements
+// of dst, and copy the upper 2 packed elements from a to the upper elements of
+// dst.
+//
+// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+// dst[63:32] := Convert_Int32_To_FP32(b[63:32])
+// dst[95:64] := a[95:64]
+// dst[127:96] := a[127:96]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps
+FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
+{
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
+                     vget_high_f32(vreinterpretq_f32_m128(a))));
+}
+
+// Convert the signed 32-bit integer b to a single-precision (32-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+//
+// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+// dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss
+FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
+{
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
+}
+
+// Convert the signed 32-bit integer b to a single-precision (32-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+//
+// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+// dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss
+#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)
+
+// Convert the signed 64-bit integer b to a single-precision (32-bit)
+// floating-point element, store the result in the lower element of dst, and
+// copy the upper 3 packed elements from a to the upper elements of dst.
+//
+// dst[31:0] := Convert_Int64_To_FP32(b[63:0])
+// dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss
+FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
+{
+    return vreinterpretq_m128_f32(
+        vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
+}
+
+// Convert the lower single-precision (32-bit) floating-point element in a to a
+// 32-bit integer, and store the result in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si
+FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
+{
+#if defined(__aarch64__)
+    return vgetq_lane_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a)), 0);
+#else
+    float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    float32_t diff = data - floor(data);
+    if (diff > 0.5)
+        return (int32_t) ceil(data);
+    if (unlikely(diff == 0.5)) {
+        int32_t f = (int32_t) floor(data);
+        int32_t c = (int32_t) ceil(data);
+        return c & 1 ? f : c;
+    }
+    return (int32_t) floor(data);
+#endif
+}
+
+// Convert packed 16-bit integers in a to packed single-precision (32-bit)
+// floating-point elements, and store the results in dst.
+//
+// FOR j := 0 to 3
+//   i := j*16
+//   m := j*32
+//   dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps
+FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
+{
+    return vreinterpretq_m128_f32(
+        vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
+}
+
+// Convert packed 32-bit integers in b to packed single-precision (32-bit)
+// floating-point elements, store the results in the lower 2 elements of dst,
+// and copy the upper 2 packed elements from a to the upper elements of dst.
+//
+// dst[31:0] := Convert_Int32_To_FP32(b[31:0])
+// dst[63:32] := Convert_Int32_To_FP32(b[63:32])
+// dst[95:64] := a[95:64]
+// dst[127:96] := a[127:96]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps
+FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
+{
+    return vreinterpretq_m128_f32(
+        vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
+                     vget_high_f32(vreinterpretq_f32_m128(a))));
+}
+
+// Convert packed signed 32-bit integers in a to packed single-precision
+// (32-bit) floating-point elements, store the results in the lower 2 elements
+// of dst, then convert the packed signed 32-bit integers in b to
+// single-precision (32-bit) floating-point elements, and store the results in
+// the upper 2 elements of dst.
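+// As an illustrative sketch (example values assumed):
+//   _mm_cvtpi32x2_ps(_mm_set_pi32(2, 1), _mm_set_pi32(4, 3))
+// yields {1.0f, 2.0f, 3.0f, 4.0f} across lanes 0..3.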
+// +// dst[31:0] := Convert_Int32_To_FP32(a[31:0]) +// dst[63:32] := Convert_Int32_To_FP32(a[63:32]) +// dst[95:64] := Convert_Int32_To_FP32(b[31:0]) +// dst[127:96] := Convert_Int32_To_FP32(b[63:32]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps +FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b) +{ + return vreinterpretq_m128_f32(vcvtq_f32_s32( + vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)))); +} + +// Convert the lower packed 8-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// +// FOR j := 0 to 3 +// i := j*8 +// m := j*32 +// dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps +FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a) +{ + return vreinterpretq_m128_f32(vcvtq_f32_s32( + vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a)))))); +} + +// Convert packed unsigned 16-bit integers in a to packed single-precision +// (32-bit) floating-point elements, and store the results in dst. +// +// FOR j := 0 to 3 +// i := j*16 +// m := j*32 +// dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps +FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a) +{ + return vreinterpretq_m128_f32( + vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a)))); +} + +// Convert the lower packed unsigned 8-bit integers in a to packed +// single-precision (32-bit) floating-point elements, and store the results in +// dst. +// +// FOR j := 0 to 3 +// i := j*8 +// m := j*32 +// dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i]) +// ENDFOR +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps +FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a) +{ + return vreinterpretq_m128_f32(vcvtq_f32_u32( + vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a)))))); +} + +// Converts the four single-precision, floating-point values of a to signed +// 32-bit integer values using truncate. +// https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) +{ + return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))); +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. +// +// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64 +FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a) +{ +#if defined(__aarch64__) + return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0); +#else + double ret = *((double *) &a); + return (int64_t) ret; +#endif +} + +// Convert the lower double-precision (64-bit) floating-point element in a to a +// 64-bit integer with truncation, and store the result in dst. 
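+// For example (values assumed): _mm_cvttsd_si64(_mm_set_sd(-2.7)) returns -2,
+// since truncation rounds toward zero rather than to nearest.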
+//
+// dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0])
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x
+#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
+
+// Converts the four signed 32-bit integer values of a to single-precision,
+// floating-point values.
+// https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx
+FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
+{
+    return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
+}
+
+// Convert packed signed 32-bit integers in a to packed double-precision
+// (64-bit) floating-point elements, and store the results in dst.
+//
+// FOR j := 0 to 1
+//   i := j*32
+//   m := j*64
+//   dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtepi32_pd
+FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));
+#else
+    double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
+    double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1);
+    return _mm_set_pd(a1, a0);
+#endif
+}
+
+// Convert packed signed 32-bit integers in a to packed double-precision
+// (64-bit) floating-point elements, and store the results in dst.
+//
+// FOR j := 0 to 1
+//   i := j*32
+//   m := j*64
+//   dst[m+63:m] := Convert_Int32_To_FP64(a[i+31:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_pd
+FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));
+#else
+    double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0);
+    double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1);
+    return _mm_set_pd(a1, a0);
+#endif
+}
+
+// Converts the eight unsigned 8-bit integers in the lower 64 bits to eight
+// unsigned 16-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
+{
+    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);    /* xxxx xxxx xxxx DCBA */
+    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
+    return vreinterpretq_m128i_u16(u16x8);
+}
+
+// Converts the four unsigned 8-bit integers in the lower 32 bits to four
+// unsigned 32-bit integers.
+// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx
+FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
+{
+    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx DCBA */
+    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0D0C 0B0A */
+    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
+    return vreinterpretq_m128i_u32(u32x4);
+}
+
+// Converts the two unsigned 8-bit integers in the lower 16 bits to two
+// unsigned 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
+{
+    uint8x16_t u8x16 = vreinterpretq_u8_m128i(a);      /* xxxx xxxx xxxx xxBA */
+    uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16));   /* 0x0x 0x0x 0x0x 0B0A */
+    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
+    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_u64(u64x2);
+}
+
+// Converts the eight signed 8-bit integers in the lower 64 bits to eight
+// signed 16-bit integers.
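+// Unlike _mm_cvtepu8_epi16 above, this sign-extends: with a low byte of 0xFF
+// (an assumed example value), _mm_cvtepi8_epi16 produces -1 in lane 0 while
+// _mm_cvtepu8_epi16 produces 255.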
+FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
+{
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);    /* xxxx xxxx xxxx DCBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
+    return vreinterpretq_m128i_s16(s16x8);
+}
+
+// Converts the four signed 8-bit integers in the lower 32 bits to four
+// signed 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
+{
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx DCBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0D0C 0B0A */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
+    return vreinterpretq_m128i_s32(s32x4);
+}
+
+// Converts the two signed 8-bit integers in the lower 16 bits to two
+// signed 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
+{
+    int8x16_t s8x16 = vreinterpretq_s8_m128i(a);      /* xxxx xxxx xxxx xxBA */
+    int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16));   /* 0x0x 0x0x 0x0x 0B0A */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
+    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_s64(s64x2);
+}
+
+// Converts the four signed 16-bit integers in the lower 64 bits to four signed
+// 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
+{
+    return vreinterpretq_m128i_s32(
+        vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
+}
+
+// Converts the two signed 16-bit integers in the lower 32 bits to two signed
+// 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
+{
+    int16x8_t s16x8 = vreinterpretq_s16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
+    int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
+    int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_s64(s64x2);
+}
+
+// Converts the four unsigned 16-bit integers in the lower 64 bits to four
+// unsigned 32-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
+{
+    return vreinterpretq_m128i_u32(
+        vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
+}
+
+// Converts the two unsigned 16-bit integers in the lower 32 bits to two
+// unsigned 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
+{
+    uint16x8_t u16x8 = vreinterpretq_u16_m128i(a);     /* xxxx xxxx xxxx 0B0A */
+    uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
+    uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
+    return vreinterpretq_m128i_u64(u64x2);
+}
+
+// Converts the two unsigned 32-bit integers in the lower 64 bits to two
+// unsigned 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
+{
+    return vreinterpretq_m128i_u64(
+        vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
+}
+
+// Converts the two signed 32-bit integers in the lower 64 bits to two signed
+// 64-bit integers.
+FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
+{
+    return vreinterpretq_m128i_s64(
+        vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
+}
+
+// Converts the four single-precision, floating-point values of a to signed
+// 32-bit integer values.
+//
+// r0 := (int) a0
+// r1 := (int) a1
+// r2 := (int) a2
+// r3 := (int) a3
+//
+// https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx
+// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
+// does not support! It is supported on ARMv8-A however.
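+// A sketch of the round-to-even behaviour (example values assumed):
+// _mm_cvtps_epi32(_mm_set_ps(2.5f, 1.5f, 0.5f, -0.5f)) yields {0, 0, 2, 2}
+// across lanes 0..3, with every half-way case rounded to the nearest even
+// integer.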
+FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a)); +#else + uint32x4_t signmask = vdupq_n_u32(0x80000000); + float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), + vdupq_n_f32(0.5f)); /* +/- 0.5 */ + int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32( + vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/ + int32x4_t r_trunc = + vcvtq_s32_f32(vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */ + int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32( + vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ + int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), + vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ + float32x4_t delta = vsubq_f32( + vreinterpretq_f32_m128(a), + vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ + uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */ + return vreinterpretq_m128i_s32(vbslq_s32(is_delta_half, r_even, r_normal)); +#endif +} + +// Convert packed single-precision (32-bit) floating-point elements in a to +// packed 16-bit integers, and store the results in dst. Note: this intrinsic +// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and +// 0x7FFFFFFF. +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi16 +FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a) +{ + return vreinterpret_m64_s16( + vmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a)))); +} + +// Copy the lower 32-bit integer in a to dst. +// +// dst[31:0] := a[31:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32 +FORCE_INLINE int _mm_cvtsi128_si32(__m128i a) +{ + return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); +} + +// Copy the lower 64-bit integer in a to dst. +// +// dst[63:0] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64 +FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a) +{ + return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0); +} + +// Copy the lower 64-bit integer in a to dst. +// +// dst[63:0] := a[63:0] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x +#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) + +// Moves 32-bit integer a to the least significant 32 bits of an __m128 object, +// zero extending the upper bits. +// +// r0 := a +// r1 := 0x0 +// r2 := 0x0 +// r3 := 0x0 +// +// https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx +FORCE_INLINE __m128i _mm_cvtsi32_si128(int a) +{ + return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0)); +} + +// Moves 64-bit integer a to the least significant 64 bits of an __m128 object, +// zero extending the upper bits. +// +// r0 := a +// r1 := 0x0 +FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a) +{ + return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0)); +} + +// Cast vector of type __m128 to type __m128d. This intrinsic is only used for +// compilation and does not generate any instructions, thus it has zero latency. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd +FORCE_INLINE __m128d _mm_castps_pd(__m128 a) +{ + return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a)); +} + +// Applies a type cast to reinterpret four 32-bit floating point values passed +// in as a 128-bit parameter as packed 32-bit integers. 
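+// The cast is purely a bit-pattern reinterpretation; as an assumed example,
+// _mm_castps_si128(_mm_set1_ps(1.0f)) reads back 0x3F800000 in each 32-bit
+// lane.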
+// https://msdn.microsoft.com/en-us/library/bb514099.aspx
+FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
+{
+    return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
+}
+
+// Cast vector of type __m128i to type __m128d. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_pd
+FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
+#else
+    return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
+#endif
+}
+
+// Applies a type cast to reinterpret four 32-bit integers passed in as a
+// 128-bit parameter as packed 32-bit floating point values.
+// https://msdn.microsoft.com/en-us/library/bb514029.aspx
+FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
+{
+    return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
+}
+
+// Loads 128-bit value. :
+// https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx
+FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
+{
+    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into
+// both elements of dst.
+//
+// dst[63:0] := MEM[mem_addr+63:mem_addr]
+// dst[127:64] := MEM[mem_addr+63:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd
+FORCE_INLINE __m128d _mm_load1_pd(const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
+#else
+    return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
+#endif
+}
+
+// Load a double-precision (64-bit) floating-point element from memory into
+// both elements of dst.
+//
+// dst[63:0] := MEM[mem_addr+63:mem_addr]
+// dst[127:64] := MEM[mem_addr+63:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1
+#define _mm_load_pd1 _mm_load1_pd
+
+// Load a double-precision (64-bit) floating-point element from memory into
+// both elements of dst.
+//
+// dst[63:0] := MEM[mem_addr+63:mem_addr]
+// dst[127:64] := MEM[mem_addr+63:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd
+#define _mm_loaddup_pd _mm_load1_pd
+
+// Load a double-precision (64-bit) floating-point element from memory into the
+// upper element of dst, and copy the lower element from a to dst. mem_addr
+// does not need to be aligned on any particular boundary.
+//
+// dst[63:0] := a[63:0]
+// dst[127:64] := MEM[mem_addr+63:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd
+FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
+#else
+    return vreinterpretq_m128d_f32(
+        vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(a)),
+                     vld1_f32((const float *) p)));
+#endif
+}
+
+// Loads 128-bit value. :
+// https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx
+FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
+{
+    return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
+}
+
+// Load unaligned 32-bit integer from memory into the first element of dst.
+//
+// dst[31:0] := MEM[mem_addr+31:mem_addr]
+// dst[MAX:32] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32
+FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
+{
+    return vreinterpretq_m128i_s32(
+        vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
+}
+
+// Convert packed double-precision (64-bit) floating-point elements in a to
+// packed single-precision (32-bit) floating-point elements, and store the
+// results in dst.
+//
+// FOR j := 0 to 1
+//   i := 32*j
+//   k := 64*j
+//   dst[i+31:i] := Convert_FP64_To_FP32(a[k+63:k])
+// ENDFOR
+// dst[127:64] := 0
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps
+FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
+{
+#if defined(__aarch64__)
+    float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
+    return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
+#else
+    float a0 = (float) ((double *) &a)[0];
+    float a1 = (float) ((double *) &a)[1];
+    return _mm_set_ps(0, 0, a1, a0);
+#endif
+}
+
+// Copy the lower double-precision (64-bit) floating-point element of a to dst.
+//
+// dst[63:0] := a[63:0]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64
+FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
+{
+#if defined(__aarch64__)
+    return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
+#else
+    return ((double *) &a)[0];
+#endif
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed double-precision (64-bit) floating-point elements, and store the
+// results in dst.
+//
+// FOR j := 0 to 1
+//   i := 64*j
+//   k := 32*j
+//   dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd
+FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128d_f64(
+        vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
+#else
+    double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
+    double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
+    return _mm_set_pd(a1, a0);
+#endif
+}
+
+// Cast vector of type __m128d to type __m128i. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128
+FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
+{
+    return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
+}
+
+// Cast vector of type __m128d to type __m128. This intrinsic is only used for
+// compilation and does not generate any instructions, thus it has zero latency.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps
+FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
+{
+    return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
+}
+
+// Blend packed single-precision (32-bit) floating-point elements from a and b
+// using mask, and store the results in dst.
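+// Only the sign bit of each mask lane matters; as an assumed example, with
+// a = {1, 2, 3, 4}, b = {5, 6, 7, 8} and mask = {-0.0f, 0.0f, -0.0f, 0.0f},
+// _mm_blendv_ps(a, b, mask) yields {5, 2, 7, 4}.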
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps +FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask) +{ + // Use a signed shift right to create a mask with the sign bit + uint32x4_t mask = + vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31)); + float32x4_t a = vreinterpretq_f32_m128(_a); + float32x4_t b = vreinterpretq_f32_m128(_b); + return vreinterpretq_m128_f32(vbslq_f32(mask, b, a)); +} + +// Blend packed single-precision (32-bit) floating-point elements from a and b +// using mask, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps +FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8) +{ + const uint32_t ALIGN_STRUCT(16) + data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0, + ((imm8) & (1 << 1)) ? UINT32_MAX : 0, + ((imm8) & (1 << 2)) ? UINT32_MAX : 0, + ((imm8) & (1 << 3)) ? UINT32_MAX : 0}; + uint32x4_t mask = vld1q_u32(data); + float32x4_t a = vreinterpretq_f32_m128(_a); + float32x4_t b = vreinterpretq_f32_m128(_b); + return vreinterpretq_m128_f32(vbslq_f32(mask, b, a)); +} + +// Blend packed double-precision (64-bit) floating-point elements from a and b +// using mask, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd +FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask) +{ + uint64x2_t mask = + vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63)); +#if defined(__aarch64__) + float64x2_t a = vreinterpretq_f64_m128d(_a); + float64x2_t b = vreinterpretq_f64_m128d(_b); + return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a)); +#else + uint64x2_t a = vreinterpretq_u64_m128d(_a); + uint64x2_t b = vreinterpretq_u64_m128d(_b); + return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a)); +#endif +} + +typedef struct { + uint16_t res0; + uint8_t res1 : 6; + uint8_t bit22 : 1; + uint8_t bit23 : 1; + uint8_t res2; +#if defined(__aarch64__) + uint32_t res3; +#endif +} fpcr_bitfield; + +// Macro: Set the rounding mode bits of the MXCSR control and status register to +// the value in unsigned 32-bit integer a. The rounding mode may contain any of +// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, +// _MM_ROUND_TOWARD_ZERO +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE +FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding) +{ + union { + fpcr_bitfield field; +#if defined(__aarch64__) + uint64_t value; +#else + uint32_t value; +#endif + } r; + +#if defined(__aarch64__) + asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */ +#else + asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ +#endif + + switch (rounding) { + case _MM_ROUND_TOWARD_ZERO: + r.field.bit22 = 1; + r.field.bit23 = 1; + break; + case _MM_ROUND_DOWN: + r.field.bit22 = 0; + r.field.bit23 = 1; + break; + case _MM_ROUND_UP: + r.field.bit22 = 1; + r.field.bit23 = 0; + break; + default: //_MM_ROUND_NEAREST + r.field.bit22 = 0; + r.field.bit23 = 0; + } + +#if defined(__aarch64__) + asm volatile("msr FPCR, %0" ::"r"(r)); /* write */ +#else + asm volatile("vmsr FPSCR, %0" ::"r"(r)); /* write */ +#endif +} + +FORCE_INLINE void _mm_setcsr(unsigned int a) +{ + _MM_SET_ROUNDING_MODE(a); +} + +// Round the packed single-precision (32-bit) floating-point elements in a using +// the rounding parameter, and store the results as packed single-precision +// floating-point elements in dst. 
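+// As an assumed example, _mm_round_ps(_mm_set1_ps(2.5f),
+// _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) yields 2.0f in every lane
+// (ties go to even), whereas _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC yields
+// 3.0f.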
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps
+FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
+{
+#if defined(__aarch64__)
+    switch (rounding) {
+    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
+        return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
+    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
+        return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
+    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
+        return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
+    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
+        return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
+    default:  //_MM_FROUND_CUR_DIRECTION
+        return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
+    }
+#else
+    float *v_float = (float *) &a;
+    __m128 zero, neg_inf, pos_inf;
+
+    switch (rounding) {
+    case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
+        return _mm_cvtepi32_ps(_mm_cvtps_epi32(a));
+    case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
+        return (__m128){floorf(v_float[0]), floorf(v_float[1]),
+                        floorf(v_float[2]), floorf(v_float[3])};
+    case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
+        return (__m128){ceilf(v_float[0]), ceilf(v_float[1]),
+                        ceilf(v_float[2]), ceilf(v_float[3])};
+    case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
+        // Truncation rounds positive values down (floor) and non-positive
+        // values up (ceil). Note that _mm_set_ps takes its arguments from
+        // the highest lane to the lowest.
+        zero = _mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f);
+        neg_inf = _mm_set_ps(floorf(v_float[3]), floorf(v_float[2]),
+                             floorf(v_float[1]), floorf(v_float[0]));
+        pos_inf = _mm_set_ps(ceilf(v_float[3]), ceilf(v_float[2]),
+                             ceilf(v_float[1]), ceilf(v_float[0]));
+        return _mm_blendv_ps(neg_inf, pos_inf, _mm_cmple_ps(a, zero));
+    default:  //_MM_FROUND_CUR_DIRECTION
+        return (__m128){roundf(v_float[0]), roundf(v_float[1]),
+                        roundf(v_float[2]), roundf(v_float[3])};
+    }
+#endif
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers, and store the results in dst.
+//
+// FOR j := 0 to 1
+//   i := 32*j
+//   dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi
+FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
+{
+#if defined(__aarch64__)
+    return vreinterpret_m64_s32(
+        vget_low_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a))));
+#else
+    return vreinterpret_m64_s32(
+        vcvt_s32_f32(vget_low_f32(vreinterpretq_f32_m128(
+            _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)))));
+#endif
+}
+
+// Convert packed single-precision (32-bit) floating-point elements in a to
+// packed 32-bit integers, and store the results in dst.
+//
+// FOR j := 0 to 1
+//   i := 32*j
+//   dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i])
+// ENDFOR
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi32
+#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)
+
+// Round the packed single-precision (32-bit) floating-point elements in a up
+// to an integer value, and store the results as packed single-precision
+// floating-point elements in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps
+FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
+{
+    return _mm_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);
+}
+
+// Round the lower single-precision (32-bit) floating-point element in b up to
+// an integer value, store the result as a single-precision floating-point
+// element in the lower element of dst, and copy the upper 3 packed elements
+// from a to the upper elements of dst.
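+// For instance (values assumed): with a = {1.1f, 2.2f, 3.3f, 4.4f} and b
+// holding 0.3f in its lower lane, _mm_ceil_ss(a, b) yields
+// {1.0f, 2.2f, 3.3f, 4.4f}.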
+//
+// dst[31:0] := CEIL(b[31:0])
+// dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss
+FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(
+        a, _mm_round_ps(b, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC));
+}
+
+// Round the packed single-precision (32-bit) floating-point elements in a down
+// to an integer value, and store the results as packed single-precision
+// floating-point elements in dst.
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps
+FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
+{
+    return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);
+}
+
+// Round the lower single-precision (32-bit) floating-point element in b down
+// to an integer value, store the result as a single-precision floating-point
+// element in the lower element of dst, and copy the upper 3 packed elements
+// from a to the upper elements of dst.
+//
+// dst[31:0] := FLOOR(b[31:0])
+// dst[127:32] := a[127:32]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss
+FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
+{
+    return _mm_move_ss(
+        a, _mm_round_ps(b, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
+}
+
+// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
+// may perform better than _mm_loadu_si128 when the data crosses a cache line
+// boundary.
+//
+// dst[127:0] := MEM[mem_addr+127:mem_addr]
+//
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128
+#define _mm_lddqu_si128 _mm_loadu_si128
+
+/* Miscellaneous Operations */
+
+// Shifts the 8 signed 16-bit integers in a right by count bits while shifting
+// in the sign bit.
+//
+// r0 := a0 >> count
+// r1 := a1 >> count
+// ...
+// r7 := a7 >> count
+//
+// https://msdn.microsoft.com/en-us/library/3c9997dk(v%3dvs.90).aspx
+FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
+{
+    int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
+    if (unlikely(c > 15))
+        return _mm_cmplt_epi16(a, _mm_setzero_si128());
+    return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c)));
+}
+
+// Shifts the 4 signed 32-bit integers in a right by count bits while shifting
+// in the sign bit.
+//
+// r0 := a0 >> count
+// r1 := a1 >> count
+// r2 := a2 >> count
+// r3 := a3 >> count
+//
+// https://msdn.microsoft.com/en-us/library/ce40009e(v%3dvs.100).aspx
+FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
+{
+    int64_t c = (int64_t) vget_low_s64((int64x2_t) count);
+    if (unlikely(c > 31))
+        return _mm_cmplt_epi32(a, _mm_setzero_si128());
+    return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c)));
+}
+
+// Packs the 16 signed 16-bit integers from a and b into 8-bit integers and
+// saturates.
+// https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s8(
+        vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
+                    vqmovn_s16(vreinterpretq_s16_m128i(b))));
+}
+
+// Packs the 16 signed 16-bit integers from a and b into 8-bit unsigned
+// integers and saturates.
+//
+// r0 := UnsignedSaturate(a0)
+// r1 := UnsignedSaturate(a1)
+// ...
+// r7 := UnsignedSaturate(a7)
+// r8 := UnsignedSaturate(b0)
+// r9 := UnsignedSaturate(b1)
+// ...
+// r15 := UnsignedSaturate(b7)
+//
+// https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx
+FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
+{
+    return vreinterpretq_m128i_u8(
+        vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
+                    vqmovun_s16(vreinterpretq_s16_m128i(b))));
+}
+
+// Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers
+// and saturates.
+//
+// r0 := SignedSaturate(a0)
+// r1 := SignedSaturate(a1)
+// r2 := SignedSaturate(a2)
+// r3 := SignedSaturate(a3)
+// r4 := SignedSaturate(b0)
+// r5 := SignedSaturate(b1)
+// r6 := SignedSaturate(b2)
+// r7 := SignedSaturate(b3)
+//
+// https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_s16(
+        vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
+                     vqmovn_s32(vreinterpretq_s32_m128i(b))));
+}
+
+// Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit
+// integers and saturates.
+//
+// r0 := UnsignedSaturate(a0)
+// r1 := UnsignedSaturate(a1)
+// r2 := UnsignedSaturate(a2)
+// r3 := UnsignedSaturate(a3)
+// r4 := UnsignedSaturate(b0)
+// r5 := UnsignedSaturate(b1)
+// r6 := UnsignedSaturate(b2)
+// r7 := UnsignedSaturate(b3)
+FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
+{
+    return vreinterpretq_m128i_u16(
+        vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
+                     vqmovun_s32(vreinterpretq_s32_m128i(b))));
+}
+
+// Interleaves the lower 8 signed or unsigned 8-bit integers in a with the
+// lower 8 signed or unsigned 8-bit integers in b.
+//
+// r0 := a0
+// r1 := b0
+// r2 := a1
+// r3 := b1
+// ...
+// r14 := a7
+// r15 := b7
+//
+// https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s8(
+        vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
+#else
+    int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
+    int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
+    int8x8x2_t result = vzip_s8(a1, b1);
+    return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
+#endif
+}
+
+// Interleaves the lower 4 signed or unsigned 16-bit integers in a with the
+// lower 4 signed or unsigned 16-bit integers in b.
+//
+// r0 := a0
+// r1 := b0
+// r2 := a1
+// r3 := b1
+// r4 := a2
+// r5 := b2
+// r6 := a3
+// r7 := b3
+//
+// https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx
+FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
+{
+#if defined(__aarch64__)
+    return vreinterpretq_m128i_s16(
+        vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
+#else
+    int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
+    int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
+    int16x4x2_t result = vzip_s16(a1, b1);
+    return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
+#endif
+}
+
+// Interleaves the lower 2 signed or unsigned 32-bit integers in a with the
+// lower 2 signed or unsigned 32-bit integers in b.
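+// As an assumed example, with a = {1, 2, 3, 4} and b = {5, 6, 7, 8},
+// _mm_unpacklo_epi32(a, b) yields {1, 5, 2, 6}.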
+// +// r0 := a0 +// r1 := b0 +// r2 := a1 +// r3 := b1 +// +// https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s32( + vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +#else + int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a)); + int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b)); + int32x2x2_t result = vzip_s32(a1, b1); + return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); +#endif +} + +FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) +{ + int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a)); + int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b)); + return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l)); +} + +// Selects and interleaves the lower two single-precision, floating-point values +// from a and b. +// +// r0 := a0 +// r1 := b0 +// r2 := a1 +// r3 := b1 +// +// https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128_f32( + vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#else + float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a)); + float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b)); + float32x2x2_t result = vzip_f32(a1, b1); + return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); +#endif +} + +// Unpack and interleave double-precision (64-bit) floating-point elements from +// the low half of a and b, and store the results in dst. +// +// DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { +// dst[63:0] := src1[63:0] +// dst[127:64] := src2[63:0] +// RETURN dst[127:0] +// } +// dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_pd +FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + return vreinterpretq_m128d_s64( + vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)), + vget_low_s64(vreinterpretq_s64_m128d(b)))); +#endif +} + +// Unpack and interleave double-precision (64-bit) floating-point elements from +// the high half of a and b, and store the results in dst. +// +// DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { +// dst[63:0] := src1[127:64] +// dst[127:64] := src2[127:64] +// RETURN dst[127:0] +// } +// dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd +FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128d_f64( + vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); +#else + return vreinterpretq_m128d_s64( + vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)), + vget_high_s64(vreinterpretq_s64_m128d(b)))); +#endif +} + +// Selects and interleaves the upper two single-precision, floating-point values +// from a and b. 
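+// As an assumed example, with a = {1, 2, 3, 4} and b = {5, 6, 7, 8},
+// _mm_unpackhi_ps(a, b) yields {3, 7, 4, 8}.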
+// +// r0 := a2 +// r1 := b2 +// r2 := a3 +// r3 := b3 +// +// https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx +FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128_f32( + vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); +#else + float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a)); + float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b)); + float32x2x2_t result = vzip_f32(a1, b1); + return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); +#endif +} + +// Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper +// 8 signed or unsigned 8-bit integers in b. +// +// r0 := a8 +// r1 := b8 +// r2 := a9 +// r3 := b9 +// ... +// r14 := a15 +// r15 := b15 +// +// https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s8( + vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); +#else + int8x8_t a1 = + vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a))); + int8x8_t b1 = + vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b))); + int8x8x2_t result = vzip_s8(a1, b1); + return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); +#endif +} + +// Interleaves the upper 4 signed or unsigned 16-bit integers in a with the +// upper 4 signed or unsigned 16-bit integers in b. +// +// r0 := a4 +// r1 := b4 +// r2 := a5 +// r3 := b5 +// r4 := a6 +// r5 := b6 +// r6 := a7 +// r7 := b7 +// +// https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s16( + vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); +#else + int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a)); + int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b)); + int16x4x2_t result = vzip_s16(a1, b1); + return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); +#endif +} + +// Interleaves the upper 2 signed or unsigned 32-bit integers in a with the +// upper 2 signed or unsigned 32-bit integers in b. +// https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx +FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) +{ +#if defined(__aarch64__) + return vreinterpretq_m128i_s32( + vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +#else + int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a)); + int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b)); + int32x2x2_t result = vzip_s32(a1, b1); + return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); +#endif +} + +// Interleaves the upper signed or unsigned 64-bit integer in a with the +// upper signed or unsigned 64-bit integer in b. +// +// r0 := a1 +// r1 := b1 +FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) +{ + int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a)); + int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b)); + return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h)); +} + +// Horizontally compute the minimum amongst the packed unsigned 16-bit integers +// in a, store the minimum and index in dst, and zero the remaining bits in dst. 
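+// As an assumed example, for the eight 16-bit lanes {7, 2, 9, 2, 8, 6, 5, 3},
+// _mm_minpos_epu16 yields 2 in lane 0 (the minimum) and 1 in lane 1 (the
+// index of its first occurrence), with all remaining lanes zeroed.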
+// +// index[2:0] := 0 +// min[15:0] := a[15:0] +// FOR j := 0 to 7 +// i := j*16 +// IF a[i+15:i] < min[15:0] +// index[2:0] := j +// min[15:0] := a[i+15:i] +// FI +// ENDFOR +// dst[15:0] := min[15:0] +// dst[18:16] := index[2:0] +// dst[127:19] := 0 +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16 +FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a) +{ + __m128i dst; + uint16_t min, idx = 0; + // Find the minimum value +#if defined(__aarch64__) + min = vminvq_u16(vreinterpretq_u16_m128i(a)); +#else + __m64 tmp; + tmp = vreinterpret_m64_u16( + vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)), + vget_high_u16(vreinterpretq_u16_m128i(a)))); + tmp = vreinterpret_m64_u16( + vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp))); + tmp = vreinterpret_m64_u16( + vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp))); + min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0); +#endif + // Get the index of the minimum value + int i; + for (i = 0; i < 8; i++) { + if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) { + idx = (uint16_t) i; + break; + } + a = _mm_srli_si128(a, 2); + } + // Generate result + dst = _mm_setzero_si128(); + dst = vreinterpretq_m128i_u16( + vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0)); + dst = vreinterpretq_m128i_u16( + vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1)); + return dst; +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return the CF value. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128 +FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b) +{ + int64x2_t s64 = + vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))), + vreinterpretq_s64_m128i(b)); + return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); +} + +// Compute the bitwise AND of 128 bits (representing integer data) in a and b, +// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the +// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, +// otherwise set CF to 0. Return the ZF value. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128 +FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b) +{ + int64x2_t s64 = + vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)); + return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); +} + +// Extracts the selected signed or unsigned 8-bit integer from a and zero +// extends. +// FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm) +#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm)) + +// Inserts the least significant 8 bits of b into the selected 8-bit integer +// of a. +// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b, +// __constrange(0,16) int imm) +#define _mm_insert_epi8(a, b, imm) \ + __extension__({ \ + vreinterpretq_m128i_s8( \ + vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \ + }) + +// Extracts the selected signed or unsigned 16-bit integer from a and zero +// extends. 
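+// As an assumed example, if lane 3 of a holds 0xFFFF, _mm_extract_epi16(a, 3)
+// returns 65535 rather than -1, since the result is zero-extended.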
+// https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx +// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm) +#define _mm_extract_epi16(a, imm) \ + vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm)) + +// Extract a 16-bit integer from a, selected with imm8, and store the result in +// the lower element of dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_extract_pi16 +#define _mm_extract_pi16(a, imm) \ + (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm)) + +// Inserts the least significant 16 bits of b into the selected 16-bit integer +// of a. +// https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx +// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b, +// __constrange(0,8) int imm) +#define _mm_insert_epi16(a, b, imm) \ + __extension__({ \ + vreinterpretq_m128i_s16( \ + vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \ + }) + +// Copy a to dst, and insert the 16-bit integer i into dst at the location +// specified by imm8. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16 +#define _mm_insert_pi16(a, b, imm) \ + __extension__({ \ + vreinterpret_m64_s16( \ + vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \ + }) + +// Extracts the selected signed or unsigned 32-bit integer from a and zero +// extends. +// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm) +#define _mm_extract_epi32(a, imm) \ + vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)) + +// Extracts the selected single-precision (32-bit) floating-point from a. +// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm) +#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm)) + +// Inserts the least significant 32 bits of b into the selected 32-bit integer +// of a. +// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b, +// __constrange(0,4) int imm) +#define _mm_insert_epi32(a, b, imm) \ + __extension__({ \ + vreinterpretq_m128i_s32( \ + vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \ + }) + +// Extracts the selected signed or unsigned 64-bit integer from a and zero +// extends. +// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm) +#define _mm_extract_epi64(a, imm) \ + vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm)) + +// Inserts the least significant 64 bits of b into the selected 64-bit integer +// of a. +// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b, +// __constrange(0,2) int imm) +#define _mm_insert_epi64(a, b, imm) \ + __extension__({ \ + vreinterpretq_m128i_s64( \ + vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \ + }) + +// Count the number of bits set to 1 in unsigned 32-bit integer a, and +// return that count in dst. 
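+// As an assumed example, _mm_popcnt_u32(0xF0F0F0F0) returns 16.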
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32 +FORCE_INLINE int _mm_popcnt_u32(unsigned int a) +{ +#if defined(__aarch64__) +#if __has_builtin(__builtin_popcount) + return __builtin_popcount(a); +#else + return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a))); +#endif +#else + uint32_t count = 0; + uint8x8_t input_val, count8x8_val; + uint16x4_t count16x4_val; + uint32x2_t count32x2_val; + + input_val = vld1_u8((uint8_t *) &a); + count8x8_val = vcnt_u8(input_val); + count16x4_val = vpaddl_u8(count8x8_val); + count32x2_val = vpaddl_u16(count16x4_val); + + vst1_u32(&count, count32x2_val); + return count; +#endif +} + +// Count the number of bits set to 1 in unsigned 64-bit integer a, and +// return that count in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64 +FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) +{ +#if defined(__aarch64__) +#if __has_builtin(__builtin_popcountll) + return __builtin_popcountll(a); +#else + return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a))); +#endif +#else + uint64_t count = 0; + uint8x8_t input_val, count8x8_val; + uint16x4_t count16x4_val; + uint32x2_t count32x2_val; + uint64x1_t count64x1_val; + + input_val = vld1_u8((uint8_t *) &a); + count8x8_val = vcnt_u8(input_val); + count16x4_val = vpaddl_u8(count8x8_val); + count32x2_val = vpaddl_u16(count16x4_val); + count64x1_val = vpaddl_u32(count32x2_val); + vst1_u64(&count, count64x1_val); + return count; +#endif +} + +// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision +// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the +// transposed matrix in these vectors (row0 now contains column 0, etc.). +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS +#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ + do { \ + float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \ + float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \ + row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \ + vget_low_f32(ROW23.val[0])); \ + row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \ + vget_low_f32(ROW23.val[1])); \ + row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \ + vget_high_f32(ROW23.val[0])); \ + row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \ + vget_high_f32(ROW23.val[1])); \ + } while (0) + +/* Crypto Extensions */ + +#if defined(__ARM_FEATURE_CRYPTO) +// Wraps vmull_p64 +FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) +{ + poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0); + poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0); + return vreinterpretq_u64_p128(vmull_p64(a, b)); +} +#else // ARMv7 polyfill +// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8. +// +// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a +// 64-bit->128-bit polynomial multiply. +// +// It needs some work and is somewhat slow, but it is still faster than all +// known scalar methods. 
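+// In carry-less multiplication the partial products are combined with XOR, so
+// there is no carry propagation; as an assumed example, 0b11 multiplied
+// carry-lessly by 0b11 gives 0b101 (5), not 9.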
+// +// Algorithm adapted to C from +// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted +// from "Fast Software Polynomial Multiplication on ARM Processors Using the +// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab +// (https://hal.inria.fr/hal-01506572) +static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) +{ + poly8x8_t a = vreinterpret_p8_u64(_a); + poly8x8_t b = vreinterpret_p8_u64(_b); + + // Masks + uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff), + vcreate_u8(0x00000000ffffffff)); + uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff), + vcreate_u8(0x0000000000000000)); + + // Do the multiplies, rotating with vext to get all combinations + uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0 + uint8x16_t e = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1 + uint8x16_t f = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0 + uint8x16_t g = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2 + uint8x16_t h = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0 + uint8x16_t i = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3 + uint8x16_t j = + vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0 + uint8x16_t k = + vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // L = A0 * B4 + + // Add cross products + uint8x16_t l = veorq_u8(e, f); // L = E + F + uint8x16_t m = veorq_u8(g, h); // M = G + H + uint8x16_t n = veorq_u8(i, j); // N = I + J + + // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL + // instructions. +#if defined(__aarch64__) + uint8x16_t lm_p0 = vreinterpretq_u8_u64( + vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); + uint8x16_t lm_p1 = vreinterpretq_u8_u64( + vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); + uint8x16_t nk_p0 = vreinterpretq_u8_u64( + vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); + uint8x16_t nk_p1 = vreinterpretq_u8_u64( + vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); +#else + uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m)); + uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m)); + uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k)); + uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k)); +#endif + // t0 = (L) (P0 + P1) << 8 + // t1 = (M) (P2 + P3) << 16 + uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1); + uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32); + uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h); + + // t2 = (N) (P4 + P5) << 24 + // t3 = (K) (P6 + P7) << 32 + uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1); + uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00); + uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h); + + // De-interleave +#if defined(__aarch64__) + uint8x16_t t0 = vreinterpretq_u8_u64( + vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); + uint8x16_t t1 = vreinterpretq_u8_u64( + vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); + uint8x16_t t2 = vreinterpretq_u8_u64( + vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); + uint8x16_t t3 = vreinterpretq_u8_u64( + vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); +#else + uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h)); + uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h)); + uint8x16_t t3 = 
vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h)); + uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h)); +#endif + // Shift the cross products + uint8x16_t t0_shift = vextq_u8(t0, t0, 15); // t0 << 8 + uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16 + uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24 + uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32 + + // Accumulate the products + uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift); + uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift); + uint8x16_t mix = veorq_u8(d, cross1); + uint8x16_t r = veorq_u8(mix, cross2); + return vreinterpretq_u64_u8(r); +} +#endif // ARMv7 polyfill + +// Perform a carry-less multiplication of two 64-bit integers, selected from a +// and b according to imm8, and store the results in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128 +FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm) +{ + uint64x2_t a = vreinterpretq_u64_m128i(_a); + uint64x2_t b = vreinterpretq_u64_m128i(_b); + switch (imm & 0x11) { + case 0x00: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b))); + case 0x01: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b))); + case 0x10: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b))); + case 0x11: + return vreinterpretq_m128i_u64( + _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b))); + default: + abort(); + } +} + +#if !defined(__ARM_FEATURE_CRYPTO) +/* clang-format off */ +#define SSE2NEON_AES_DATA(w) \ + { \ + w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \ + w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \ + w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \ + w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \ + w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \ + w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \ + w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \ + w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \ + w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \ + w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \ + w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \ + w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \ + w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \ + w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \ + w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \ + w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \ + w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \ + w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \ + w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \ + w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \ + w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \ + w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \ + w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \ + w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \ + w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \ + w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \ + w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \ + w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \ + w(0x1c), 
w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \ + w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \ + w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \ + w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \ + w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \ + w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \ + w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \ + w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \ + w(0xb0), w(0x54), w(0xbb), w(0x16) \ + } +/* clang-format on */ + +/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */ +#define SSE2NEON_AES_H0(x) (x) +static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0); +#undef SSE2NEON_AES_H0 + +// In the absence of crypto extensions, implement aesenc using regular neon +// intrinsics instead. See: +// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/ +// https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and +// https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52 +// for more information Reproduced with permission of the author. +FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey) +{ +#if defined(__aarch64__) + static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9, + 0xe, 0x3, 0x8, 0xd, 0x2, 0x7, + 0xc, 0x1, 0x6, 0xb}; + static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4, + 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc}; + + uint8x16_t v; + uint8x16_t w = vreinterpretq_u8_m128i(EncBlock); + + // shift rows + w = vqtbl1q_u8(w, vld1q_u8(shift_rows)); + + // sub bytes + v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_sbox), w); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80); + v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0); + + // mix columns + w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b); + w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v); + w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); + + // add round key + return vreinterpretq_m128i_u8(w) ^ RoundKey; + +#else /* ARMv7-A NEON implementation */ +#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \ + (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \ + (b0)) +#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */)) +#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x) +#define SSE2NEON_AES_U0(p) \ + SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p)) +#define SSE2NEON_AES_U1(p) \ + SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p) +#define SSE2NEON_AES_U2(p) \ + SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p) +#define SSE2NEON_AES_U3(p) \ + SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p)) + static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = { + SSE2NEON_AES_DATA(SSE2NEON_AES_U0), + SSE2NEON_AES_DATA(SSE2NEON_AES_U1), + SSE2NEON_AES_DATA(SSE2NEON_AES_U2), + SSE2NEON_AES_DATA(SSE2NEON_AES_U3), + }; +#undef SSE2NEON_AES_B2W +#undef SSE2NEON_AES_F2 +#undef SSE2NEON_AES_F3 +#undef SSE2NEON_AES_U0 +#undef SSE2NEON_AES_U1 +#undef SSE2NEON_AES_U2 +#undef SSE2NEON_AES_U3 + + uint32_t x0 = _mm_cvtsi128_si32(EncBlock); + uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55)); + uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA)); + uint32_t x3 = 
_mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF)); + + __m128i out = _mm_set_epi32( + (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^ + aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]), + (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^ + aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]), + (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^ + aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]), + (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^ + aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24])); + + return _mm_xor_si128(out, RoundKey); +#endif +} + +// Perform the last round of an AES encryption flow on data (state) in a using +// the round key in RoundKey, and store the result in dst. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128 +FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) +{ + /* FIXME: optimized for NEON */ + uint8_t v[4][4] = { + [0] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]}, + [1] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]}, + [2] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]}, + [3] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)], + SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]}, + }; + for (int i = 0; i < 16; i++) + vreinterpretq_nth_u8_m128i(a, i) = + v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i); + return a; +} + +// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist. +// This instruction generates a round key for AES encryption. See +// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/ +// for details. +// +// https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx +FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon) +{ + uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55)); + uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF)); + for (int i = 0; i < 4; ++i) { + ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]]; + ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]]; + } + return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3, + ((X1 >> 8) | (X1 << 24)) ^ rcon, X1); +} +#undef SSE2NEON_AES_DATA + +#else /* __ARM_FEATURE_CRYPTO */ +// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and +// AESMC and then manually applying the real key as an xor operation. This +// unfortunately means an additional xor op; the compiler should be able to +// optimize this away for repeated calls however. See +// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a +// for more details. 
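+//
+// With an all-zero round key the AddRoundKey step inside AESE is a no-op, so
+// the identity being relied on is (sketch):
+//
+//   x86:    aesenc(a, k) == MixColumns(SubBytes(ShiftRows(a))) ^ k
+//   ARMv8:  vaesmcq_u8(vaeseq_u8(a, 0)) ^ k   // same value, one extra eor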
+FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u8( + vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^ + vreinterpretq_u8_m128i(b)); +} + +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128 +FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) +{ + return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8( + vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), + RoundKey); +} + +FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) +{ + // AESE does ShiftRows and SubBytes on A + uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)); + + uint8x16_t dest = { + // Undo ShiftRows step from AESE and extract X1 and X3 + u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1) + u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1)) + u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3) + u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3)) + }; + uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon}; + return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r); +} +#endif + +/* Streaming Extensions */ + +// Guarantees that every preceding store is globally visible before any +// subsequent store. +// https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx +FORCE_INLINE void _mm_sfence(void) +{ + __sync_synchronize(); +} + +// Store 64-bits of integer data from a into memory using a non-temporal memory +// hint. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pi +FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a) +{ + vst1_s64((int64_t *) p, vreinterpret_s64_m64(a)); +} + +// Store 128-bits (composed of 4 packed single-precision (32-bit) floating- +// point elements) from a into memory using a non-temporal memory hint. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps +FORCE_INLINE void _mm_stream_ps(float *p, __m128 a) +{ +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, (float32x4_t *) p); +#else + vst1q_f32(p, vreinterpretq_f32_m128(a)); +#endif +} + +// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point +// elements) from a into memory using a non-temporal memory hint. mem_addr must +// be aligned on a 16-byte boundary or a general-protection exception may be +// generated. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_pd +FORCE_INLINE void _mm_stream_pd(double *p, __m128d a) +{ +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, (float32x4_t *) p); +#elif defined(__aarch64__) + vst1q_f64(p, vreinterpretq_f64_m128d(a)); +#else + vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a)); +#endif +} + +// Stores the data in a to the address p without polluting the caches. If the +// cache line containing address p is already in the cache, the cache will be +// updated. +// https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx +FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a) +{ +#if __has_builtin(__builtin_nontemporal_store) + __builtin_nontemporal_store(a, p); +#else + vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a)); +#endif +} + +// Store 32-bit integer a into memory using a non-temporal hint to minimize +// cache pollution. If the cache line containing address mem_addr is already in +// the cache, the cache will be updated. 
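+// NEON has no scalar non-temporal store, so the version below is an ordinary
+// vst1q_lane_s32. A minimal usage sketch (buffer and value are illustrative):
+//
+//   int dst[4];
+//   for (int i = 0; i < 4; i++)
+//       _mm_stream_si32(&dst[i], 0);
+//   _mm_sfence();   // publish the stores before another thread reads dst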
+// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_si32 +FORCE_INLINE void _mm_stream_si32(int *p, int a) +{ + vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0); +} + +// Load 128-bits of integer data from memory into dst using a non-temporal +// memory hint. mem_addr must be aligned on a 16-byte boundary or a +// general-protection exception may be generated. +// +// dst[127:0] := MEM[mem_addr+127:mem_addr] +// +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128 +FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p) +{ +#if __has_builtin(__builtin_nontemporal_store) + return __builtin_nontemporal_load(p); +#else + return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p)); +#endif +} + +// Cache line containing p is flushed and invalidated from all caches in the +// coherency domain. : +// https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx +FORCE_INLINE void _mm_clflush(void const *p) +{ + (void) p; + // no corollary for Neon? +} + +// Allocate aligned blocks of memory. +// https://software.intel.com/en-us/ +// cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks +FORCE_INLINE void *_mm_malloc(size_t size, size_t align) +{ + void *ptr; + if (align == 1) + return malloc(size); + if (align == 2 || (sizeof(void *) == 8 && align == 4)) + align = sizeof(void *); + if (!posix_memalign(&ptr, align, size)) + return ptr; + return NULL; +} + +// Conditionally store 8-bit integer elements from a into memory using mask +// (elements are not stored when the highest bit is not set in the corresponding +// element) and a non-temporal memory hint. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_maskmove_si64 +FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr) +{ + int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7); + __m128 b = _mm_load_ps((const float *) mem_addr); + int8x8_t masked = + vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a), + vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b)))); + vst1_s8((int8_t *) mem_addr, masked); +} + +// Conditionally store 8-bit integer elements from a into memory using mask +// (elements are not stored when the highest bit is not set in the corresponding +// element) and a non-temporal memory hint. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_maskmovq +#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr) + +// Free aligned memory that was allocated with _mm_malloc. +// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_free +FORCE_INLINE void _mm_free(void *addr) +{ + free(addr); +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 8-bit integer v. +// https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100) +FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#else + crc ^= v; + for (int bit = 0; bit < 8; bit++) { + if (crc & 1) + crc = (crc >> 1) ^ UINT32_C(0x82f63b78); + else + crc = (crc >> 1); + } +#endif + return crc; +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 16-bit integer v. 
+// https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100) +FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#else + crc = _mm_crc32_u8(crc, v & 0xff); + crc = _mm_crc32_u8(crc, (v >> 8) & 0xff); +#endif + return crc; +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 32-bit integer v. +// https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100) +FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#else + crc = _mm_crc32_u16(crc, v & 0xffff); + crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff); +#endif + return crc; +} + +// Starting with the initial value in crc, accumulates a CRC32 value for +// unsigned 64-bit integer v. +// https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100) +FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v) +{ +#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) + __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t" + : [c] "+r"(crc) + : [v] "r"(v)); +#else + crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff); + crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff); +#endif + return crc; +} + +FORCE_INLINE void _mm_empty (void) { } + +#if defined(__GNUC__) || defined(__clang__) +#pragma pop_macro("ALIGN_STRUCT") +#pragma pop_macro("FORCE_INLINE") +#endif + +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC pop_options +#endif + +#endif diff --git a/dedicated/wscript b/dedicated/wscript new file mode 100755 index 00000000..485ef288 --- /dev/null +++ b/dedicated/wscript @@ -0,0 +1,68 @@ +#! /usr/bin/env python +# encoding: utf-8 + +from waflib import Utils +import os + +top = '.' 
+PROJECT_NAME = 'dedicated' + +def options(opt): + # stub + return + +def configure(conf): + conf.define('LAUNCHERONLY',1) +# conf.define('SUPPORT_PACKED_STORE',1) + conf.define('DEDICATED',1) + +def build(bld): + source = [ + 'filesystem.cpp', + '../public/filesystem_init.cpp', + '../common/netapi.cpp', + '../common/SteamAppStartup.cpp', + 'sys_common.cpp', + 'sys_ded.cpp', + #'sys_windows.cpp', [$WINDOWS] + 'sys_linux.cpp', # [$POSIX] + 'console/conproc.cpp', + 'console/textconsole.cpp', + 'console/TextConsoleUnix.cpp', # [$POSIX] + '../filesystem/filetracker.cpp', + '../filesystem/basefilesystem.cpp', + '../filesystem/packfile.cpp', + '../filesystem/filesystem_async.cpp', + '../filesystem/filesystem_stdio.cpp', + '../filesystem/QueuedLoader.cpp', + '../public/zip_utils.cpp', + '../filesystem/linux_support.cpp' # [$POSIX] + ] + + includes = [ + '.', + '../public', + '../public/tier0', + '../public/tier1', + '../common' + ] + bld.env.INCLUDES_SDL2 + + defines = [] + + libs = ['tier0','tier1','tier2','tier3','vstdlib','steam_api','vpklib','appframework','mathlib', 'EDIT'] + + install_path = bld.env.LIBDIR + + bld.shlib( + source = source, + target = PROJECT_NAME, + name = PROJECT_NAME, + features = 'c cxx', + includes = includes, + defines = defines, + use = libs, + install_path = install_path, + subsystem = bld.env.MSVC_SUBSYSTEM, + idx = bld.get_taskgen_count() + ) + diff --git a/dedicated_main/main.cpp b/dedicated_main/main.cpp index 9c64d7aa..de75a373 100644 --- a/dedicated_main/main.cpp +++ b/dedicated_main/main.cpp @@ -209,6 +209,9 @@ int main( int argc, char *argv[] ) const char *pBinaryName = "dedicated" DLL_EXT_STRING; void *dedicated = dlopen( pBinaryName, RTLD_NOW ); + if ( !dedicated ) + dedicated = dlopen( "libdedicated" DLL_EXT_STRING, RTLD_NOW ); + if ( !dedicated ) { printf( "Failed to open %s (%s)\n", pBinaryName, dlerror()); diff --git a/dedicated_main/wscript b/dedicated_main/wscript new file mode 100755 index 00000000..97a5f994 --- /dev/null +++ b/dedicated_main/wscript @@ -0,0 +1,43 @@ +#! /usr/bin/env python +# encoding: utf-8 + +from waflib import Utils +import os + +top = '.' +PROJECT_NAME = 'dedicated_launcher' + +def options(opt): + # stub + return + +def configure(conf): + return + +def build(bld): + + source = ['main.cpp'] + includes = ['../public', '../public/tier0'] + defines = [] + libs = [] + + if bld.env.DEST_OS != 'win32': + libs += [ 'DL' ] + else: + libs += ['USER32', 'SHELL32'] + source += ['dedicated_main.rc'] + + install_path = bld.env.BINDIR + bld( + source = source, + target = PROJECT_NAME, + name = PROJECT_NAME, + features = 'c cxx cxxprogram', + includes = includes, + defines = defines, + use = libs, + install_path = install_path, + subsystem = bld.env.MSVC_SUBSYSTEM, + idx = bld.get_taskgen_count() + ) + diff --git a/engine/sys_dll2.cpp b/engine/sys_dll2.cpp index 30fbe4ae..e6742211 100644 --- a/engine/sys_dll2.cpp +++ b/engine/sys_dll2.cpp @@ -1175,7 +1175,7 @@ InitReturnVal_t CEngineAPI::Init() m_bRunningSimulation = false; // Initialize the FPU control word -#if defined(WIN32) && !defined( SWDS ) && !defined( _X360 ) +#if defined(WIN32) && !defined( SWDS ) && !defined( _X360 ) && !defined (__arm__) _asm { fninit diff --git a/engine/sys_engine.cpp b/engine/sys_engine.cpp index c75abcff..d9083778 100644 --- a/engine/sys_engine.cpp +++ b/engine/sys_engine.cpp @@ -39,6 +39,9 @@ // memdbgon must be the last include file in a .cpp file!!! 
#include "tier0/memdbgon.h" +#ifdef POSIX +#include +#endif //----------------------------------------------------------------------------- // Forward declarations @@ -366,7 +369,11 @@ void CEngine::Frame( void ) for (int i = 2000; i >= 0; --i) { #if defined(POSIX) +#ifdef __arm__ + raise(SIGINT); +#else __asm( "pause" ); __asm( "pause" ); __asm( "pause" ); __asm( "pause" ); +#endif #elif defined(IS_WINDOWS_PC) _asm { pause }; _asm { pause }; _asm { pause }; _asm { pause }; #endif diff --git a/engine/wscript b/engine/wscript index ce4dc481..2881101e 100755 --- a/engine/wscript +++ b/engine/wscript @@ -12,6 +12,10 @@ def options(opt): return def configure(conf): + if conf.options.DEDICATED: + conf.define('SWDS', 1) + conf.define('NO_STEAM', 1) + conf.env.append_unique('DEFINES',[ '__USEA3D', '_ADD_EAX_', @@ -22,50 +26,18 @@ def configure(conf): def build(bld): source = [ - 'client_pch.cpp', ##[!$DEDICATED] - 'cl_rcon.cpp', ##[!$DEDICATED] 'socketcreator.cpp', - 'rpt_engine.cpp', ##[!$DEDICATED] - 'cl_steamauth.cpp',# #[!$DEDICATED] 'clientframe.cpp', 'decal_clip.cpp', 'demofile.cpp', 'DevShotGenerator.cpp', 'OcclusionSystem.cpp', 'tmessage.cpp', - 'r_efx.cpp',# #[!$DEDICATED] - 'view.cpp', ##[!$DEDICATED] 'baseclient.cpp', 'baseclientstate.cpp', 'cbenchmark.cpp', - 'cdll_engine_int.cpp', ##[!$DEDICATED] / - 'cl_main.cpp',##[!$DEDICATED] / - 'cl_demo.cpp',##[!$DEDICATED] / - #'cl_null.cpp', [$DEDICATED] / - 'cl_demoaction.cpp', #[!$DEDICATED] / - 'cl_demoaction_types.cpp', #[!$DEDICATED] / - 'cl_demoactioneditors.cpp', #[!$DEDICATED] / - 'cl_demoactionmanager.cpp', #[!$DEDICATED] / - 'cl_demoeditorpanel.cpp', #[!$DEDICATED] / - 'cl_demosmootherpanel.cpp', #[!$DEDICATED] / - 'cl_demouipanel.cpp', #[!$DEDICATED] / - 'cl_foguipanel.cpp', #[!$DEDICATED] / - 'cl_txviewpanel.cpp', #[!$DEDICATED] / - 'cl_entityreport.cpp', #[!$DEDICATED] / - 'cl_ents_parse.cpp', #[!$DEDICATED] / - 'cl_localnetworkbackdoor.cpp', #[!$DEDICATED] / - 'cl_parse_event.cpp', #[!$DEDICATED] / - 'cl_pluginhelpers.cpp',#[!$DEDICATED] / - 'cl_pred.cpp', #[!$DEDICATED] / - 'cl_texturelistpanel.cpp', #[!$DEDICATED] / - 'client.cpp',#[!$DEDICATED] / - 'colorcorrectionpanel.cpp', #[!$DEDICATED] / 'console.cpp', 'render_pch.cpp', - - 'buildcubemaps.cpp', #[!$DEDICATED] / - 'debug_leafvis.cpp', #[!$DEDICATED] / - 'debugoverlay.cpp', #[!$DEDICATED] / 'decals.cpp', 'disp.cpp', 'disp_interface.cpp', @@ -73,32 +45,19 @@ def build(bld): 'gl_draw.cpp', 'gl_rsurf.cpp', 'gl_shader.cpp', - 'gl_drawlights.cpp', #[!$DEDICATED] / - 'gl_lightmap.cpp', #[!$DEDICATED] / - 'gl_matsysiface.cpp', #[!$DEDICATED] / - 'gl_rlight.cpp',#[!$DEDICATED] / - 'gl_rmain.cpp',#[!$DEDICATED] / - 'gl_rmisc.cpp',#[!$DEDICATED] / - 'gl_screen.cpp',#[!$DEDICATED] / - 'gl_warp.cpp',#[!$DEDICATED] / 'l_studio.cpp', 'matsys_interface.cpp', 'modelloader.cpp', 'Overlay.cpp', - 'r_areaportal.cpp', #[!$DEDICATED] / 'r_decal.cpp', 'r_linefile.cpp', - 'shadowmgr.cpp',#[!$DEDICATED] 'server_pch.cpp', - - 'sv_ipratelimit.cpp', 'sv_rcon.cpp', 'sv_steamauth.cpp', 'sv_uploaddata.cpp', 'sv_uploadgamestats.cpp', 'vengineserver_impl.cpp', - 'sv_main.cpp', 'sv_client.cpp', 'sv_ents_write.cpp', @@ -110,7 +69,6 @@ def build(bld): 'sv_precache.cpp', 'sv_redirect.cpp', 'sv_remoteaccess.cpp', - 'baseautocompletefilelist.cpp', 'baseserver.cpp', 'bitbuf_errorhandler.cpp', @@ -121,7 +79,6 @@ def build(bld): 'checksum_engine.cpp', 'ccs.cpp', 'clockdriftmgr.cpp', - 'cl_bounded_cvars.cpp', ##[!$DEDICATED] 'cl_check_process.cpp', 'cmd.cpp', 'cmodel.cpp', @@ -137,7 +94,6 @@ def build(bld): 
'../public/disp_powerinfo.cpp', '../public/dispcoll_common.cpp', 'DownloadListGenerator.cpp', - 'downloadthread.cpp', ##[!$DEDICATED] 'dt.cpp', 'dt_common_eng.cpp', 'dt_encode.cpp', @@ -178,14 +134,8 @@ def build(bld): 'initmathlib.cpp', '../common/language.cpp', 'LocalNetworkBackdoor.cpp', - 'LoadScreenUpdate.cpp', #[!$DEDICATED] '../public/lumpfiles.cpp', 'MapReslistGenerator.cpp', - 'matchmakinghost.cpp', #[!$DEDICATED] - 'matchmakingqos.cpp', #[!$DEDICATED] - 'matchmakingclient.cpp', #[!$DEDICATED] - 'matchmakingshared.cpp', #[!$DEDICATED] - 'matchmakingmigrate.cpp', #[!$DEDICATED] 'materialproxyfactory.cpp', 'mem_fgets.cpp', 'mod_vis.cpp', @@ -211,10 +161,8 @@ def build(bld): 'engine_replay_int.cpp', 'replay_internal.cpp', 'replaydemo.cpp', - 'replaydemoplayer.cpp', #[!$DEDICATED] 'replayserver.cpp', '../public/sentence.cpp', - 'Session.cpp', #[!$DEDICATED] 'sound_shared.cpp', 'spatialpartition.cpp', 'staticpropmgr.cpp', @@ -222,7 +170,6 @@ def build(bld): 'sys_dll.cpp', 'sys_dll2.cpp', 'sys_engine.cpp', - 'sys_mainwind.cpp', #[!$DEDICATED] 'sys_linuxwind.cpp', #[$POSIX] 'testscriptmgr.cpp', 'traceinit.cpp', @@ -234,56 +181,22 @@ def build(bld): '../public/XZip.cpp', '../public/XUnzip.cpp', 'zone.cpp', - 'bugreporter.cpp', #[!$DEDICATED] 'cheatcodes.cpp', - 'download.cpp', #[!$DEDICATED] '../public/editor_sendcommand.cpp', - 'host_saverestore.cpp', #[!$DEDICATED] 'keys.cpp', - 'lightcache.cpp', #[!$DEDICATED] 'networkstringtableclient.cpp', - 'saverestore_filesystem.cpp', #[!$DEDICATED] '../public/scratchpad3d.cpp', 'servermsghandler.cpp', - 'sys_getmodes.cpp', #[!$DEDICATED] - 'vgui_askconnectpanel.cpp', #[!$DEDICATED] - 'xboxsystem.cpp', #[!$DEDICATED] '../common/SourceAppInfo.cpp', 'snd_io.cpp', 'EngineSoundServer.cpp', - 'EngineSoundClient.cpp', #[!$DEDICATED] - 'engsoundservice.cpp', #[!$DEDICATED] 'audio/private/voice_wavefile.cpp', - 'audio/private/MPAFile.cpp', #[!$DEDICATED&&!$X360] - 'audio/private/MPAHeader.cpp', #[!$DEDICATED&&!$X360] - 'audio/private/circularbuffer.cpp', #[!$DEDICATED] - 'audio/private/snd_posix.cpp', # [$POSIX] - - 'audio/audio_pch.cpp', #[!$DEDICATED] - 'audio/private/vox.cpp', - 'audio/private/snd_dev_common.cpp', #[!$DEDICATED] / - 'audio/private/snd_dma.cpp', #[!$DEDICATED] / - 'audio/private/snd_dsp.cpp', #[!$DEDICATED] / - 'audio/private/snd_mix.cpp', #[!$DEDICATED] / - 'audio/private/snd_sentence_mixer.cpp', #[!$DEDICATED]/ - 'audio/private/snd_wave_data.cpp', #[!$DEDICATED] / - 'audio/private/snd_wave_mixer.cpp', #[!$DEDICATED] / - 'audio/private/snd_wave_mixer_adpcm.cpp', #[!$DEDICATED] / - 'audio/private/snd_wave_source.cpp', #[!$DEDICATED] / - 'audio/private/snd_wave_temp.cpp', #[!$DEDICATED] / - 'audio/private/snd_win.cpp', #[!$DEDICATED] / - 'audio/private/voice_gain.cpp', - + 'audio/private/snd_posix.cpp', # [$POSIX] #'audio/private/snd_dev_direct.cpp', [$WINDOWS]/ #'audio/private/snd_dev_wave.cpp', [$WINDOWS]/ - 'audio/private/snd_mp3_source.cpp', #[!$DEDICATED]/ - 'audio/private/snd_wave_mixer_mp3.cpp', #[!$DEDICATED] / - 'audio/private/VBRHeader.cpp', # [!$DEDICATED&&!$X360]/ - 'audio/private/voice.cpp', #[!$DEDICATED&&!$X360]/ #'audio/private/voice_mixer_controls.cpp', [$WINDOWS] / #'audio/private/voice_record_dsound.cpp', [$WINDOWS] / - 'audio/private/voice_sound_engine_interface.cpp', #[!$DEDICATED&&!$X360] #'audio/private/snd_dev_xaudio.cpp',[$X360] #'audio/private/snd_wave_mixer_xma.cpp', [$X360] @@ -291,29 +204,116 @@ def build(bld): 'audio/private/snd_dev_sdl.cpp', #[$SDL && !$OSXALL] #'audio/private/snd_dev_openal.cpp', # 
[$OSXALL] #'audio/private/snd_dev_mac_audioqueue.cpp',# [$OSXALL] - 'audio/private/voice_mixer_controls_openal.cpp', #[$OSXALL||($LINUXALL&&!$DEDICATED)] - 'audio/private/voice_record_openal.cpp', #[$OSXALL||($LINUXALL&&!$DEDICATED)] #'audio/private/voice_record_mac_audioqueue.cpp', #[$OSXALL] - - '../public/vgui_controls/vgui_controls.cpp', - '../common/vgui/vgui_basebudgetpanel.cpp', - '../common/vgui/vgui_budgetbargraphpanel.cpp', - '../common/vgui/vgui_budgethistorypanel.cpp', - '../common/vgui/vgui_budgetpanelshared.cpp', - 'perfuipanel.cpp', - 'vgui_basepanel.cpp', - 'vgui_baseui_interface.cpp', - 'vgui_budgetpanel.cpp', - 'vgui_DebugSystemPanel.cpp', - 'vgui_drawtreepanel.cpp', - 'vgui_helpers.cpp', - 'vgui_texturebudgetpanel.cpp', - 'vgui_vprofgraphpanel.cpp', - 'vgui_vprofpanel.cpp', - 'enginetool.cpp', #[!$DEDICATED] - 'toolframework.cpp' ] + if bld.env.DEDICATED: + source += ['cl_null.cpp'] + else: + source += [ + 'client_pch.cpp', + 'cl_rcon.cpp', + 'r_efx.cpp', + 'view.cpp', + 'rpt_engine.cpp', + 'cl_steamauth.cpp', + 'cdll_engine_int.cpp', + 'cl_main.cpp', + 'cl_demo.cpp', + 'cl_demoaction.cpp', + 'cl_demoaction_types.cpp', + 'cl_demoactioneditors.cpp', + 'cl_demoactionmanager.cpp', + 'cl_demoeditorpanel.cpp', + 'cl_demosmootherpanel.cpp', + 'cl_demouipanel.cpp', + 'cl_foguipanel.cpp', + 'cl_txviewpanel.cpp', + 'cl_entityreport.cpp', + 'cl_ents_parse.cpp', + 'cl_localnetworkbackdoor.cpp', + 'cl_parse_event.cpp', + 'cl_pluginhelpers.cpp', + 'cl_pred.cpp', + 'cl_texturelistpanel.cpp', + 'client.cpp', + 'colorcorrectionpanel.cpp', + 'buildcubemaps.cpp', + 'debug_leafvis.cpp', + 'debugoverlay.cpp', + 'LoadScreenUpdate.cpp', + 'enginetool.cpp', + 'gl_drawlights.cpp', + 'gl_lightmap.cpp', + 'gl_matsysiface.cpp', + 'gl_rlight.cpp', + 'gl_rmain.cpp', + 'gl_rmisc.cpp', + 'gl_screen.cpp', + 'gl_warp.cpp', + 'r_areaportal.cpp', + 'shadowmgr.cpp', + 'cl_bounded_cvars.cpp', + 'downloadthread.cpp', + 'matchmakinghost.cpp', + 'matchmakingqos.cpp', + 'matchmakingclient.cpp', + 'matchmakingshared.cpp', + 'matchmakingmigrate.cpp', + 'replaydemoplayer.cpp', + 'Session.cpp', + 'sys_mainwind.cpp', + 'bugreporter.cpp', + 'download.cpp', + 'host_saverestore.cpp', + 'lightcache.cpp', + 'saverestore_filesystem.cpp', + 'sys_getmodes.cpp', + 'vgui_askconnectpanel.cpp', + 'xboxsystem.cpp', + 'audio/audio_pch.cpp', + 'EngineSoundClient.cpp', + 'engsoundservice.cpp', + 'audio/private/MPAFile.cpp', #[!$X360] + 'audio/private/MPAHeader.cpp', #[!$X360] + 'audio/private/circularbuffer.cpp', + 'audio/private/snd_dev_common.cpp', + 'audio/private/snd_dma.cpp', + 'audio/private/snd_dsp.cpp', + 'audio/private/snd_mix.cpp', + 'audio/private/snd_sentence_mixer.cpp', + 'audio/private/snd_wave_data.cpp', + 'audio/private/snd_wave_mixer.cpp', + 'audio/private/snd_wave_mixer_adpcm.cpp', + 'audio/private/snd_wave_source.cpp', + 'audio/private/snd_wave_temp.cpp', + 'audio/private/snd_win.cpp', + 'audio/private/voice_gain.cpp', + 'audio/private/snd_mp3_source.cpp', + 'audio/private/snd_wave_mixer_mp3.cpp', + 'audio/private/VBRHeader.cpp', #[!$X360] + 'audio/private/voice.cpp', #[!$X360] + 'audio/private/voice_sound_engine_interface.cpp', #[!$X360] + 'audio/private/voice_mixer_controls_openal.cpp', #[$OSXALL||$LINUXALL] + 'audio/private/voice_record_openal.cpp', #[$OSXALL||$LINUXALL] + '../public/vgui_controls/vgui_controls.cpp', + '../common/vgui/vgui_basebudgetpanel.cpp', + '../common/vgui/vgui_budgetbargraphpanel.cpp', + '../common/vgui/vgui_budgethistorypanel.cpp', + '../common/vgui/vgui_budgetpanelshared.cpp', + 
'perfuipanel.cpp',
+			'vgui_basepanel.cpp',
+			'vgui_baseui_interface.cpp',
+			'vgui_budgetpanel.cpp',
+			'vgui_DebugSystemPanel.cpp',
+			'vgui_drawtreepanel.cpp',
+			'vgui_helpers.cpp',
+			'vgui_texturebudgetpanel.cpp',
+			'vgui_vprofgraphpanel.cpp',
+			'vgui_vprofpanel.cpp',
+			'toolframework.cpp'
+		]
+
 	includes = [
 		'.',
 		'../public',
@@ -327,7 +327,7 @@ def build(bld):
 
 	defines = []
 
-	libs = ['tier0','vgui_controls','dmxloader','tier1','tier2','tier3','bitmap','vstdlib','appframework','datamodel','vtf','mathlib','steam_api','matsys_controls','BZIP2','SDL2','JPEG','ZLIB','OPENAL','CURL']
+	libs = ['tier0','vgui_controls','dmxloader','tier1','tier2','tier3','bitmap','vstdlib','appframework','datamodel','vtf','mathlib','steam_api','matsys_controls','BZ2','SDL2','JPEG','ZLIB','OPENAL','CURL']
 
 	install_path = bld.env.LIBDIR
 
diff --git a/filesystem/basefilesystem.cpp b/filesystem/basefilesystem.cpp
index b154b70e..4423cb4c 100644
--- a/filesystem/basefilesystem.cpp
+++ b/filesystem/basefilesystem.cpp
@@ -5054,7 +5054,6 @@ CSysModule *CBaseFileSystem::LoadModule( const char *pFileName, const char *pPat
 #ifdef POSIX
 		Q_snprintf( tempPathID, sizeof(tempPathID), "%slib%s", m_SearchPaths[i].GetPathString(), pFileName );	// append the path to this dir.
-		printf(tempPathID);
 		pModule = Sys_LoadModule( tempPathID );
 		if ( pModule )
 			return pModule;
diff --git a/inputsystem/wscript b/inputsystem/wscript
index 3ff6305d..fe090fbf 100755
--- a/inputsystem/wscript
+++ b/inputsystem/wscript
@@ -27,9 +27,8 @@ def build(bld):
 		'.',
 		'../common',
 		'../public',
-		'../public/tier0',
-		'../thirdparty/SDL2'
-	]
+		'../public/tier0'
+	]
 
 	defines = []
 
diff --git a/launcher_main/main.cpp b/launcher_main/main.cpp
index 22d1a779..6e20ba50 100644
--- a/launcher_main/main.cpp
+++ b/launcher_main/main.cpp
@@ -216,12 +216,12 @@ static void WaitForDebuggerConnect( int argc, char *argv[], int time )
 int main( int argc, char *argv[] )
 {
 	void *launcher = dlopen( "bin/liblauncher" DLL_EXT_STRING, RTLD_NOW );
 	if( !launcher )
-		void *launcher = dlopen( "bin/launcher" DLL_EXT_STRING, RTLD_NOW );
+		launcher = dlopen( "bin/launcher" DLL_EXT_STRING, RTLD_NOW );
 	if ( !launcher )
 	{
-		fprintf( stderr, "Failed to load the launcher\n" );
+		fprintf( stderr, "%s\nFailed to load the launcher\n", dlerror() );
 		return 0;
 	}
diff --git a/materialsystem/shaderapiempty/wscript b/materialsystem/shaderapiempty/wscript
new file mode 100755
index 00000000..3dab5e41
--- /dev/null
+++ b/materialsystem/shaderapiempty/wscript
@@ -0,0 +1,52 @@
+#! /usr/bin/env python
+# encoding: utf-8
+
+from waflib import Utils
+import os
+
+top = '.'
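+
+# A stub shader API: it satisfies materialsystem's renderer dependency while
+# doing no graphics work at all, which is what lets the dedicated target be
+# built with SDL and ToGL switched off.
+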
+PROJECT_NAME = 'shaderapiempty' + +def options(opt): + # stub + return + +def configure(conf): + conf.env.append_unique('DEFINES',[ + 'SHADER_DLL_EXPORT', + 'PROTECTED_THINGS_ENABLE' + ]) + +def build(bld): + source = [ + 'shaderapiempty.cpp' + ] + + includes = [ + '.', + '../../public', + '../../public/tier0', + '../../public/tier1', + '../../common', + '../' + ] + bld.env.INCLUDES_SDL2 + + defines = [] + + libs = ['tier0','tier1'] + + install_path = bld.env.LIBDIR + + bld.shlib( + source = source, + target = PROJECT_NAME, + name = PROJECT_NAME, + features = 'c cxx', + includes = includes, + defines = defines, + use = libs, + install_path = install_path, + subsystem = bld.env.MSVC_SUBSYSTEM, + idx = bld.get_taskgen_count() + ) + diff --git a/mathlib/sse.cpp b/mathlib/sse.cpp index 018a7a5b..83dda7d9 100644 --- a/mathlib/sse.cpp +++ b/mathlib/sse.cpp @@ -11,6 +11,10 @@ #include "tier0/dbg.h" #include "mathlib/mathlib.h" #include "mathlib/vector.h" +#ifdef __arm__ +#include "sse2neon.h" +#endif + #include "sse.h" // memdbgon must be the last include file in a .cpp file!!! @@ -176,7 +180,9 @@ float _SSE_RSqrtFast(float x) Assert( s_bMathlibInitialized ); float rroot; -#ifdef _WIN32 +#ifdef __arm__ + rroot = _SSE_RSqrtAccurate(x); +#elif _WIN32 _asm { rsqrtss xmm0, x @@ -204,16 +210,19 @@ float FASTCALL _SSE_VectorNormalize (Vector& vec) #endif float *v = &vec[0]; -#ifdef _WIN32 float *r = &result[0]; -#endif float radius = 0.f; // Blah, get rid of these comparisons ... in reality, if you have all 3 as zero, it shouldn't // be much of a performance win, considering you will very likely miss 3 branch predicts in a row. if ( v[0] || v[1] || v[2] ) { -#ifdef _WIN32 +#ifdef __arm__ + float rsqrt = _SSE_RSqrtAccurate( v[0] * v[0] + v[1] * v[1] + v[2] * v[2] ); + r[0] = v[0] * rsqrt; + r[1] = v[1] * rsqrt; + r[2] = v[2] * rsqrt; +#elif _WIN32 _asm { mov eax, v @@ -287,7 +296,9 @@ void FASTCALL _SSE_VectorNormalizeFast (Vector& vec) float _SSE_InvRSquared(const float* v) { float inv_r2 = 1.f; -#ifdef _WIN32 +#ifdef __arm__ + return _SSE_RSqrtAccurate( FLT_EPSILON + v[0] * v[0] + v[1] * v[1] + v[2] * v[2] ); +#elif _WIN32 _asm { // Intel SSE only routine mov eax, v movss xmm5, inv_r2 // x5 = 1.0, 0, 0, 0 @@ -380,7 +391,14 @@ typedef __m64 v2si; // vector of 2 int (mmx) void _SSE_SinCos(float x, float* s, float* c) { -#ifdef _WIN32 +#ifdef __arm__ +#if defined( POSIX ) + sincosf(x, s, c); +#else + *s = sin( x ); + *c = cos( x ); +#endif +#elif _WIN32 float t4, t8, t12; __asm @@ -587,7 +605,9 @@ void _SSE_SinCos(float x, float* s, float* c) float _SSE_cos( float x ) { -#ifdef _WIN32 +#ifdef __arm__ + return cos(x); +#elif _WIN32 float temp; __asm { diff --git a/public/bitmap/imageformat.h b/public/bitmap/imageformat.h index 196397e0..3a982514 100644 --- a/public/bitmap/imageformat.h +++ b/public/bitmap/imageformat.h @@ -28,7 +28,9 @@ typedef enum _D3DFORMAT D3DFORMAT; //----------------------------------------------------------------------------- // don't bitch that inline functions aren't used!!!! 
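+// (C4514, "unreferenced inline function has been removed", is an MSVC-only
+// warning, so the guard keeps GCC and Clang from seeing an unknown pragma.)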
+#ifdef _WIN32 #pragma warning(disable : 4514) +#endif enum ImageFormat { diff --git a/public/materialsystem/imesh.h b/public/materialsystem/imesh.h index 3da443b4..5f23f24e 100644 --- a/public/materialsystem/imesh.h +++ b/public/materialsystem/imesh.h @@ -1156,6 +1156,9 @@ inline void CVertexBuilder::FastAdvanceNVertices( int n ) //----------------------------------------------------------------------------- inline void CVertexBuilder::FastVertex( const ModelVertexDX7_t &vertex ) { +#ifdef __arm__ + FastVertexSSE( vertex ); +#else Assert( m_CompressionType == VERTEX_COMPRESSION_NONE ); // FIXME: support compressed verts if needed Assert( m_nCurrentVertex < m_nMaxVertexCount ); @@ -1213,6 +1216,7 @@ inline void CVertexBuilder::FastVertex( const ModelVertexDX7_t &vertex ) m_bWrittenNormal = false; m_bWrittenUserData = false; #endif +#endif } inline void CVertexBuilder::FastVertexSSE( const ModelVertexDX7_t &vertex ) @@ -1322,6 +1326,9 @@ inline void CVertexBuilder::Fast4VerticesSSE( inline void CVertexBuilder::FastVertex( const ModelVertexDX8_t &vertex ) { +#ifdef __arm__ + FastVertexSSE( vertex ); +#else Assert( m_CompressionType == VERTEX_COMPRESSION_NONE ); // FIXME: support compressed verts if needed Assert( m_nCurrentVertex < m_nMaxVertexCount ); @@ -1386,8 +1393,10 @@ inline void CVertexBuilder::FastVertex( const ModelVertexDX8_t &vertex ) m_bWrittenNormal = false; m_bWrittenUserData = false; #endif +#endif } + inline void CVertexBuilder::FastVertexSSE( const ModelVertexDX8_t &vertex ) { Assert( m_CompressionType == VERTEX_COMPRESSION_NONE ); // FIXME: support compressed verts if needed @@ -1414,7 +1423,8 @@ inline void CVertexBuilder::FastVertexSSE( const ModelVertexDX8_t &vertex ) #elif defined(GNUC) const void *pRead = &vertex; void *pCurrPos = m_pCurrPosition; - __asm__ __volatile__ ( + +/* __asm__ __volatile__ ( "movaps (%0), %%xmm0\n" "movaps 16(%0), %%xmm1\n" "movaps 32(%0), %%xmm2\n" @@ -1422,8 +1432,17 @@ inline void CVertexBuilder::FastVertexSSE( const ModelVertexDX8_t &vertex ) "movntps %%xmm0, (%1)\n" "movntps %%xmm1, 16(%1)\n" "movntps %%xmm2, 32(%1)\n" - "movntps %%xmm3, 48(%1)\n" - :: "r" (pRead), "r" (pCurrPos) : "memory"); + "movntps %%xmm3, 48(%1)\n" + :: "r" (pRead), "r" (pCurrPos) : "memory"); */ + + __m128 m1 = _mm_load_ps( (float *)pRead ); + __m128 m2 = _mm_load_ps( (float *)(pRead + 16) ); + __m128 m3 = _mm_load_ps( (float *)(pRead + 32) ); + __m128 m4 = _mm_load_ps( (float *)(pRead + 48) ); + _mm_stream_ps( (float *)pCurrPos, m1 ); + _mm_stream_ps( (float *)(pCurrPos + 16), m2 ); + _mm_stream_ps( (float *)(pCurrPos + 32), m3 ); + _mm_stream_ps( (float *)(pCurrPos + 48), m4 ); #else Error( "Implement CMeshBuilder::FastVertexSSE((dx8)" ); #endif diff --git a/public/mathlib/mathlib.h b/public/mathlib/mathlib.h index a6d302ff..fe103e5e 100644 --- a/public/mathlib/mathlib.h +++ b/public/mathlib/mathlib.h @@ -458,11 +458,7 @@ void inline SinCos( float radians, float *sine, float *cosine ) *sine = sin( radians ); *cosine = cos( radians ); #elif defined( POSIX ) - double __cosr, __sinr; - __asm ("fsincos" : "=t" (__cosr), "=u" (__sinr) : "0" (radians)); - - *sine = __sinr; - *cosine = __cosr; + sincosf(radians, sine, cosine); #endif } @@ -1217,6 +1213,8 @@ FORCEINLINE int RoundFloatToInt(float f) }; flResult = __fctiw( f ); return pResult[1]; +#elif defined (__arm__) + return (int)(f + 0.5f); #else #error Unknown architecture #endif @@ -1247,8 +1245,9 @@ FORCEINLINE unsigned long RoundFloatToUnsignedLong(float f) Assert( pIntResult[1] >= 0 ); return pResult[1]; #else 
// !X360 - -#if defined( PLATFORM_WINDOWS_PC64 ) +#ifdef __arm__ + return (unsigned long)(f + 0.5f); +#elif defined( PLATFORM_WINDOWS_PC64 ) uint nRet = ( uint ) f; if ( nRet & 1 ) { diff --git a/public/mathlib/ssemath.h b/public/mathlib/ssemath.h index c2ff48d7..d5617c4b 100644 --- a/public/mathlib/ssemath.h +++ b/public/mathlib/ssemath.h @@ -8,6 +8,8 @@ #if defined( _X360 ) #include +#elif defined(__arm__) +#include "sse2neon.h" #else #include #endif @@ -21,7 +23,7 @@ #define USE_STDC_FOR_SIMD 0 #endif -#if (!defined(_X360) && (USE_STDC_FOR_SIMD == 0)) +#if (!defined (__arm__) && !defined(_X360) && (USE_STDC_FOR_SIMD == 0)) #define _SSE1 1 #endif diff --git a/public/mathlib/vector.h b/public/mathlib/vector.h index c7654ba8..c763a3e1 100644 --- a/public/mathlib/vector.h +++ b/public/mathlib/vector.h @@ -22,7 +22,8 @@ // For rand(). We really need a library! #include -#ifndef _X360 +#if defined(__SSE__) || defined(_M_IX86_FP) +#define USE_SSE // For MMX intrinsics #include #endif @@ -209,10 +210,9 @@ private: FORCEINLINE void NetworkVarConstruct( Vector &v ) { v.Zero(); } - -#define USE_M64S ( ( !defined( _X360 ) ) ) - - +#ifdef USE_SSE +#define USE_M64S +#endif //========================================================= // 4D Short Vector (aligned on 8-byte boundary) diff --git a/public/mathlib/vector4d.h b/public/mathlib/vector4d.h index 2b20c882..d63cf52b 100644 --- a/public/mathlib/vector4d.h +++ b/public/mathlib/vector4d.h @@ -16,7 +16,7 @@ #include #include // For rand(). We really need a library! #include -#if !defined( _X360 ) +#if defined(__SSE__) || defined(_M_IX86_FP) #include // For SSE #endif #include "basetypes.h" // For vec_t, put this somewhere else? @@ -141,8 +141,10 @@ public: inline void Set( vec_t X, vec_t Y, vec_t Z, vec_t W ); inline void InitZero( void ); +#ifndef __arm__ inline __m128 &AsM128() { return *(__m128*)&x; } inline const __m128 &AsM128() const { return *(const __m128*)&x; } +#endif private: // No copy constructors allowed if we're in optimal mode @@ -613,8 +615,10 @@ inline void Vector4DAligned::Set( vec_t X, vec_t Y, vec_t Z, vec_t W ) } inline void Vector4DAligned::InitZero( void ) -{ -#if !defined( _X360 ) +{ +#if defined (__arm__) + x = y = z = w = 0; +#elif !defined( _X360 ) this->AsM128() = _mm_set1_ps( 0.0f ); #else this->AsM128() = __vspltisw( 0 ); @@ -625,7 +629,7 @@ inline void Vector4DAligned::InitZero( void ) inline void Vector4DMultiplyAligned( Vector4DAligned const& a, Vector4DAligned const& b, Vector4DAligned& c ) { Assert( a.IsValid() && b.IsValid() ); -#if !defined( _X360 ) +#if !defined( _X360 ) || defined (__arm__) c.x = a.x * b.x; c.y = a.y * b.y; c.z = a.z * b.z; @@ -639,7 +643,7 @@ inline void Vector4DWeightMAD( vec_t w, Vector4DAligned const& vInA, Vector4DAli { Assert( vInA.IsValid() && vInB.IsValid() && IsFinite(w) ); -#if !defined( _X360 ) +#if !defined( _X360 ) || defined (__arm__) vOutA.x += vInA.x * w; vOutA.y += vInA.y * w; vOutA.z += vInA.z * w; @@ -660,6 +664,7 @@ inline void Vector4DWeightMAD( vec_t w, Vector4DAligned const& vInA, Vector4DAli #endif } +#ifndef __arm__ inline void Vector4DWeightMADSSE( vec_t w, Vector4DAligned const& vInA, Vector4DAligned& vOutA, Vector4DAligned const& vInB, Vector4DAligned& vOutB ) { Assert( vInA.IsValid() && vInB.IsValid() && IsFinite(w) ); @@ -681,6 +686,7 @@ inline void Vector4DWeightMADSSE( vec_t w, Vector4DAligned const& vInA, Vector4D vOutB.AsM128() = __vmaddfp( vInB.AsM128(), temp, vOutB.AsM128() ); #endif } +#endif #endif // VECTOR4D_H diff --git a/public/saverestoretypes.h 
b/public/saverestoretypes.h index 491a96e4..da06298e 100644 --- a/public/saverestoretypes.h +++ b/public/saverestoretypes.h @@ -512,27 +512,25 @@ inline const char *CSaveRestoreSegment::StringFromSymbol( int token ) /// compilers. Either way, there's no portable intrinsic. // Newer GCC versions provide this in this header, older did by default. -#if !defined( _rotr ) && defined( COMPILER_GCC ) +#if !defined( _rotr ) && defined( COMPILER_GCC ) && !defined( __arm__ ) #include #endif -#ifdef COMPILER_CLANG -static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__)) -_rotr(unsigned int _Value, int _Shift) { - _Shift &= 0x1f; - return _Shift ? (_Value >> _Shift) | (_Value << (32 - _Shift)) : _Value; +#if !defined ( _rotr ) +inline unsigned _rotr(unsigned x, unsigned n) { + return (x >> n % 32) | (x << (32-n) % 32); } #endif - inline unsigned int CSaveRestoreSegment::HashString( const char *pszToken ) { COMPILE_TIME_ASSERT( sizeof( unsigned int ) == 4 ); unsigned int hash = 0; while ( *pszToken ) + { hash = _rotr( hash, 4 ) ^ *pszToken++; - + } return hash; } diff --git a/public/tier0/platform.h b/public/tier0/platform.h index 63871391..3da32c7c 100644 --- a/public/tier0/platform.h +++ b/public/tier0/platform.h @@ -713,7 +713,7 @@ typedef void * HINSTANCE; // When we port to 64 bit, we'll have to resolve the int, ptr vs size_t 32/64 bit problems... -#if !defined( _WIN64 ) +#if !defined( _WIN64 ) && defined( _WIN32 ) #pragma warning( disable : 4267 ) // conversion from 'size_t' to 'int', possible loss of data #pragma warning( disable : 4311 ) // pointer truncation from 'char *' to 'int' #pragma warning( disable : 4312 ) // conversion from 'unsigned int' to 'memhandle_t' of greater size @@ -825,9 +825,9 @@ static FORCEINLINE double fsel(double fComparand, double fValGE, double fLT) #endif #endif - +#elif defined (__arm__) + inline void SetupFPUControlWord() {} #else - inline void SetupFPUControlWord() { __volatile unsigned short int __cw; @@ -849,7 +849,7 @@ static FORCEINLINE double fsel(double fComparand, double fValGE, double fLT) { double flResult; int pResult[2]; - }; + } flResult = __fctiw( f ); return ( pResult[1] == 1 ); } @@ -1160,7 +1160,11 @@ PLATFORM_INTERFACE struct tm * Plat_localtime( const time_t *timep, struct tm * inline uint64 Plat_Rdtsc() { -#if defined( _X360 ) +#if defined( __arm__ ) && defined (POSIX) + struct timespec t; + clock_gettime( CLOCK_REALTIME, &t); + return t.tv_sec * 1000000000ULL + t.tv_nsec; +#elif defined( _X360 ) return ( uint64 )__mftb32(); #elif defined( _WIN64 ) return ( uint64 )__rdtsc(); diff --git a/public/tier0/threadtools.h b/public/tier0/threadtools.h index b0b9b1d2..f5328699 100644 --- a/public/tier0/threadtools.h +++ b/public/tier0/threadtools.h @@ -25,6 +25,7 @@ #ifdef POSIX #include #include +#include #define WAIT_OBJECT_0 0 #define WAIT_TIMEOUT 0x00000102 #define WAIT_FAILED -1 @@ -141,9 +142,11 @@ inline void ThreadPause() #if defined( PLATFORM_WINDOWS_PC ) // Intrinsic for __asm pause; from _mm_pause(); -#elif POSIX +#elif POSIX && defined( __i386__ ) __asm __volatile( "pause" ); #elif defined( _X360 ) +#elif defined(__arm__) + sched_yield(); #else #error "implement me" #endif diff --git a/public/tier1/convar.h b/public/tier1/convar.h index 5043b6bc..f9f95355 100644 --- a/public/tier1/convar.h +++ b/public/tier1/convar.h @@ -627,7 +627,9 @@ void ConVar_PrintDescription( const ConCommandBase *pVar ); //----------------------------------------------------------------------------- // Purpose: Utility class to quickly allow 
ConCommands to call member methods //----------------------------------------------------------------------------- +#ifdef _WIN32 #pragma warning (disable : 4355 ) +#endif template< class T > class CConCommandMemberAccessor : public ConCommand, public ICommandCallback, public ICommandCompletionCallback @@ -674,8 +676,9 @@ private: FnMemberCommandCompletionCallback_t m_CompletionFunc; }; +#ifdef _WIN32 #pragma warning ( default : 4355 ) - +#endif //----------------------------------------------------------------------------- // Purpose: Utility macros to quicky generate a simple console command diff --git a/public/tier1/utlblockmemory.h b/public/tier1/utlblockmemory.h index e7e06aeb..35ef2da2 100644 --- a/public/tier1/utlblockmemory.h +++ b/public/tier1/utlblockmemory.h @@ -21,8 +21,10 @@ #include "tier0/memalloc.h" #include "tier0/memdbgon.h" +#ifdef _WIN32 #pragma warning (disable:4100) #pragma warning (disable:4514) +#endif //----------------------------------------------------------------------------- diff --git a/public/tier1/utlfixedmemory.h b/public/tier1/utlfixedmemory.h index 6ff8c19b..c051bdae 100644 --- a/public/tier1/utlfixedmemory.h +++ b/public/tier1/utlfixedmemory.h @@ -20,8 +20,10 @@ #include "tier0/memalloc.h" #include "tier0/memdbgon.h" +#ifdef _WIN32 #pragma warning (disable:4100) #pragma warning (disable:4514) +#endif //----------------------------------------------------------------------------- diff --git a/public/tier1/utllinkedlist.h b/public/tier1/utllinkedlist.h index 37ecef6b..dda8162d 100644 --- a/public/tier1/utllinkedlist.h +++ b/public/tier1/utllinkedlist.h @@ -550,8 +550,10 @@ inline I CUtlLinkedList::PrivateNext( I i ) const // Are nodes in the list or valid? //----------------------------------------------------------------------------- +#ifdef _WIN32 #pragma warning(push) #pragma warning( disable: 4310 ) // Allows "(I)(S)M::INVALID_INDEX" below +#endif template inline bool CUtlLinkedList::IndexInRange( I index ) // Static method { @@ -570,7 +572,9 @@ inline bool CUtlLinkedList::IndexInRange( I index ) // Static method return ( ( (S)index == index ) && ( (S)index != InvalidIndex() ) ); } +#ifdef _WIN32 #pragma warning(pop) +#endif template inline bool CUtlLinkedList::IsValidIndex( I i ) const diff --git a/public/tier1/utlmemory.h b/public/tier1/utlmemory.h index 9c5a1d14..66c1f28d 100644 --- a/public/tier1/utlmemory.h +++ b/public/tier1/utlmemory.h @@ -22,9 +22,10 @@ #include "tier0/memalloc.h" #include "tier0/memdbgon.h" +#ifdef _WIN32 #pragma warning (disable:4100) #pragma warning (disable:4514) - +#endif //----------------------------------------------------------------------------- diff --git a/public/tier1/utlrbtree.h b/public/tier1/utlrbtree.h index 745c162d..3f8d27f8 100644 --- a/public/tier1/utlrbtree.h +++ b/public/tier1/utlrbtree.h @@ -664,8 +664,11 @@ inline void CUtlRBTree::SetColor( I i, typename CUtlRBTree I CUtlRBTree::NewNode() { @@ -710,7 +713,9 @@ I CUtlRBTree::NewNode() return elem; } +#ifdef _WIN32 #pragma warning(pop) +#endif template < class T, class I, typename L, class M > void CUtlRBTree::FreeNode( I i ) diff --git a/public/tier1/utlvector.h b/public/tier1/utlvector.h index 15b1e262..6fffc6a2 100644 --- a/public/tier1/utlvector.h +++ b/public/tier1/utlvector.h @@ -324,9 +324,12 @@ public: // Especialy useful if you have a lot of vectors that are sparse, or if you're // carefully packing holders of vectors //----------------------------------------------------------------------------- + +#ifdef _WIN32 #pragma warning(push) #pragma 
warning(disable : 4200) // warning C4200: nonstandard extension used : zero-sized array in struct/union #pragma warning(disable : 4815 ) // warning C4815: 'staticData' : zero-sized array in stack object will have no elements +#endif class CUtlVectorUltraConservativeAllocator { @@ -573,7 +576,9 @@ private: } }; +#ifdef _WIN32 #pragma warning(pop) +#endif // Make sure nobody adds multiple inheritance and makes this class bigger. COMPILE_TIME_ASSERT( sizeof(CUtlVectorUltraConservative) == sizeof(void*) ); diff --git a/public/togl/linuxwin/glmgr.h b/public/togl/linuxwin/glmgr.h index 0b056dca..7e76a682 100644 --- a/public/togl/linuxwin/glmgr.h +++ b/public/togl/linuxwin/glmgr.h @@ -214,8 +214,8 @@ struct GLClipPlaneEnable_t { GLint enable; inline bool operator==( struct GLClipPlaneEquation_t { GLfloat x,y,z,w; inline bool operator==(const GLClipPlaneEquation_t& src) const { return EQ(x) && EQ(y) && EQ(z) && EQ(w); } }; //blend -struct GLColorMaskSingle_t { char r,g,b,a; inline bool operator==(const GLColorMaskSingle_t& src) const { return EQ(r) && EQ(g) && EQ(b) && EQ(a); } }; -struct GLColorMaskMultiple_t { char r,g,b,a; inline bool operator==(const GLColorMaskMultiple_t& src) const { return EQ(r) && EQ(g) && EQ(b) && EQ(a); } }; +struct GLColorMaskSingle_t { signed char r,g,b,a; inline bool operator==(const GLColorMaskSingle_t& src) const { return EQ(r) && EQ(g) && EQ(b) && EQ(a); } }; +struct GLColorMaskMultiple_t { signed char r,g,b,a; inline bool operator==(const GLColorMaskMultiple_t& src) const { return EQ(r) && EQ(g) && EQ(b) && EQ(a); } }; struct GLBlendEnable_t { GLint enable; inline bool operator==(const GLBlendEnable_t& src) const { return EQ(enable); } }; struct GLBlendFactor_t { GLenum srcfactor,dstfactor; inline bool operator==(const GLBlendFactor_t& src) const { return EQ(srcfactor) && EQ(dstfactor); } }; struct GLBlendEquation_t { GLenum equation; inline bool operator==(const GLBlendEquation_t& src) const { return EQ(equation); } }; @@ -225,7 +225,7 @@ struct GLBlendEnableSRGB_t { GLint enable; inline bool operator==( //depth struct GLDepthTestEnable_t { GLint enable; inline bool operator==(const GLDepthTestEnable_t& src) const { return EQ(enable); } }; struct GLDepthFunc_t { GLenum func; inline bool operator==(const GLDepthFunc_t& src) const { return EQ(func); } }; -struct GLDepthMask_t { char mask; inline bool operator==(const GLDepthMask_t& src) const { return EQ(mask); } }; +struct GLDepthMask_t { char mask; inline bool operator==(const GLDepthMask_t& src) const { return EQ(mask); } }; //stencil struct GLStencilTestEnable_t { GLint enable; inline bool operator==(const GLStencilTestEnable_t& src) const { return EQ(enable); } }; diff --git a/public/vgui/VGUI.h b/public/vgui/VGUI.h index 34fddfeb..01557119 100644 --- a/public/vgui/VGUI.h +++ b/public/vgui/VGUI.h @@ -22,6 +22,7 @@ #endif #endif +#ifdef _WIN32 #pragma warning( disable: 4800 ) // disables 'performance warning converting int to bool' #pragma warning( disable: 4786 ) // disables 'identifier truncated in browser information' warning #pragma warning( disable: 4355 ) // disables 'this' : used in base member initializer list @@ -29,6 +30,7 @@ #pragma warning( disable: 4514 ) // warning C4514: 'Color::Color' : unreferenced inline function has been removed #pragma warning( disable: 4100 ) // warning C4100: 'code' : unreferenced formal parameter #pragma warning( disable: 4127 ) // warning C4127: conditional expression is constant +#endif typedef unsigned char uchar; typedef unsigned short ushort; diff --git 
a/public/vstdlib/pch_vstdlib.h b/public/vstdlib/pch_vstdlib.h index 57242c41..bef1b7f1 100644 --- a/public/vstdlib/pch_vstdlib.h +++ b/public/vstdlib/pch_vstdlib.h @@ -11,8 +11,9 @@ // $NoKeywords: $ //============================================================================= - +#ifdef _WIN32 #pragma warning(disable: 4514) +#endif // First include standard libraries #include diff --git a/public/vstdlib/random.h b/public/vstdlib/random.h index fdfd09c6..6909a107 100644 --- a/public/vstdlib/random.h +++ b/public/vstdlib/random.h @@ -16,8 +16,10 @@ #define NTAB 32 +#ifdef _WIN32 #pragma warning(push) #pragma warning( disable:4251 ) +#endif //----------------------------------------------------------------------------- // A generator of uniformly distributed random numbers @@ -114,8 +116,9 @@ public: //----------------------------------------------------------------------------- VSTDLIB_INTERFACE void InstallUniformRandomStream( IUniformRandomStream *pStream ); - +#ifdef _WIN32 #pragma warning(pop) +#endif #endif // VSTDLIB_RANDOM_H diff --git a/tier0/cpu.cpp b/tier0/cpu.cpp index 63c89d27..cf6fd787 100644 --- a/tier0/cpu.cpp +++ b/tier0/cpu.cpp @@ -22,7 +22,9 @@ const tchar* GetProcessorVendorId(); static bool cpuid(unsigned long function, unsigned long& out_eax, unsigned long& out_ebx, unsigned long& out_ecx, unsigned long& out_edx) { -#if defined(GNUC) +#if defined (__arm__) || defined( _X360 ) + return false; +#elif defined(GNUC) asm("mov %%ebx, %%esi\n\t" "cpuid\n\t" "xchg %%esi, %%ebx" @@ -30,11 +32,9 @@ static bool cpuid(unsigned long function, unsigned long& out_eax, unsigned long& "=S" (out_ebx), "=c" (out_ecx), "=d" (out_edx) - : "a" (function) + : "a" (function) ); return true; -#elif defined( _X360 ) - return false; #elif defined(_WIN64) int pCPUInfo[4]; __cpuid( pCPUInfo, (int)function ); @@ -142,21 +142,21 @@ static bool IsWin98OrOlder() static bool CheckSSETechnology(void) { -#if defined( _X360 ) || defined( _PS3 ) +#if defined( __ARM__ ) + return false; +#elif defined( _X360 ) || defined( _PS3 ) return true; #else - if ( IsWin98OrOlder() ) - { + if ( IsWin98OrOlder() ) { return false; } - unsigned long eax,ebx,edx,unused; - if ( !cpuid(1,eax,ebx,unused,edx) ) - { + unsigned long eax,ebx,edx,unused; + if ( !cpuid(1,eax,ebx,unused,edx) ) { return false; } - return ( edx & 0x2000000L ) != 0; + return ( edx & 0x2000000L ) != 0; #endif } diff --git a/tier0/cpu_posix.cpp b/tier0/cpu_posix.cpp index 994a8ab0..ebd3dc9a 100644 --- a/tier0/cpu_posix.cpp +++ b/tier0/cpu_posix.cpp @@ -106,7 +106,7 @@ uint64 CalculateCPUFreq() uint64 retVal = 1000000; return retVal * atoi( pFreq ); } -#endif +#else // Try to open cpuinfo_max_freq. If the kernel was built with cpu scaling support disabled, this will fail. FILE *fp = fopen( "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq", "r" ); @@ -177,5 +177,6 @@ uint64 CalculateCPUFreq() } return period; +#endif } diff --git a/tier1/pathmatch.cpp b/tier1/pathmatch.cpp index f828a9ae..bd8d3cbd 100644 --- a/tier1/pathmatch.cpp +++ b/tier1/pathmatch.cpp @@ -57,6 +57,7 @@ #include #include + // Enable to do pathmatch caching. Beware: this code isn't threadsafe. // #define DO_PATHMATCH_CACHE @@ -66,7 +67,16 @@ static bool s_bShowDiag; #define DEBUG_MSG( ... 
) if ( s_bShowDiag ) fprintf( stderr, ##__VA_ARGS__ )
+
+#ifdef POSIX
+#include <signal.h>
+#define DEBUG_BREAK() raise(SIGINT)
+#elif !defined (__arm__)
 #define DEBUG_BREAK() __asm__ __volatile__ ( "int $3" )
+#else
+#define DEBUG_BREAK()
+#endif
+
 #define _COMPILE_TIME_ASSERT(pred) switch(0){case 0:case pred:;}
 
 #define WRAP( fn, ret, ... ) \
diff --git a/tier1/processor_detect_linux.cpp b/tier1/processor_detect_linux.cpp
index c7c95d04..189fef8a 100644
--- a/tier1/processor_detect_linux.cpp
+++ b/tier1/processor_detect_linux.cpp
@@ -6,6 +6,14 @@
 // $NoKeywords: $
 //===============================================================================//
 
+
+#if defined (__arm__)
+bool CheckMMXTechnology(void) { return false; }
+bool CheckSSETechnology(void) { return false; }
+bool CheckSSE2Technology(void) { return false; }
+bool Check3DNowTechnology(void) { return false; }
+#else
+
 #define cpuid(in,a,b,c,d) \
 	asm("pushl %%ebx\n\t" "cpuid\n\t" "movl %%ebx,%%esi\n\t" "pop %%ebx": "=a" (a), "=S" (b), "=c" (c), "=d" (d) : "a" (in));
 
@@ -45,3 +53,5 @@ bool Check3DNowTechnology(void)
 	}
 	return false;
 }
+
+#endif
diff --git a/tier1/reliabletimer.cpp b/tier1/reliabletimer.cpp
index af8b8421..ab46596f 100644
--- a/tier1/reliabletimer.cpp
+++ b/tier1/reliabletimer.cpp
@@ -13,6 +13,10 @@ bool CReliableTimer::sm_bUseQPC = false;
 #include "winlite.h"
 #endif
 
+#ifdef POSIX
+#include <time.h>
+#endif
+
 //-----------------------------------------------------------------------------
 // Purpose: Constructor
 //-----------------------------------------------------------------------------
@@ -83,6 +87,10 @@ int64 CReliableTimer::GetPerformanceCountNow()
 	uint64 ulNow;
 	SYS_TIMEBASE_GET( ulNow );
 	return ulNow;
+#elif defined( __arm__ ) && defined (POSIX)
+	struct timespec ts;
+	clock_gettime(CLOCK_REALTIME, &ts);
+	return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
 #else
 	uint64 un64;
 	__asm__ __volatile__ (
diff --git a/tier1/strtools.cpp b/tier1/strtools.cpp
index a63fa4fc..7f2fd270 100644
--- a/tier1/strtools.cpp
+++ b/tier1/strtools.cpp
@@ -3337,8 +3337,9 @@ const Tier1FullHTMLEntity_t g_Tier1_FullHTMLEntities[] =
 	{ L'\u00FF', "&yuml;", 6 },
 	{ 0, NULL, 0 } // sentinel for end of array
 };
+#ifdef _WIN32
 #pragma warning( pop )
-
+#endif
 
 bool V_BasicHtmlEntityEncode( char *pDest, const int nDestSize, char const *pIn, const int nInSize, bool bPreserveWhitespace /*= false*/ )
diff --git a/tier1/wscript b/tier1/wscript
index 0df2dc4f..c8a8673e 100755
--- a/tier1/wscript
+++ b/tier1/wscript
@@ -68,7 +68,8 @@ def build(bld):
 		'.',
 		'../public',
 		'../public/tier1',
-		'../public/tier0'
+		'../public/tier0',
+		'../common'
 	]
 
 	defines = []
diff --git a/vstdlib/coroutine.cpp b/vstdlib/coroutine.cpp
index 5cabe938..a1bcfc1d 100644
--- a/vstdlib/coroutine.cpp
+++ b/vstdlib/coroutine.cpp
@@ -223,11 +223,11 @@ extern "C" byte *GetStackPtr64();
 // Apple's version of gcc/g++ doesn't return the expected value using the intrinsic, so
 // do it the old fashioned way - this will also use asm on linux (since we don't compile
 // with llvm/clang there) but that seems fine.
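+// __builtin_frame_address(0) works on GCC and Clang alike and is not tied to
+// the x86 "esp" register, so it is the portable choice for ARM; the old
+// Clang-only path relied on it already.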
-#if defined(__llvm__) || defined(__clang__)
+//#if defined(__llvm__) || defined(__clang__)
 #define GetStackPtr( pStackPtr ) byte *pStackPtr = (byte*)__builtin_frame_address(0)
-#else
-#define GetStackPtr( pStackPtr ) register byte *pStackPtr __asm__( "esp" )
-#endif
+//#else
+//#define GetStackPtr( pStackPtr ) register byte *pStackPtr __asm__( "esp" )
+//#endif
 #elif defined(__SNC__)
 #define GetStackPtr( pStackPtr ) byte *pStackPtr = (byte*)__builtin_frame_address(0)
 #else
diff --git a/wscript b/wscript
index f408f343..fc5bfd2e 100644
--- a/wscript
+++ b/wscript
@@ -7,7 +7,7 @@ from waflib import Logs, Context, Configure
 import sys
 import os
 
-VERSION = '0.99'
+VERSION = '1.0'
 APPNAME = 'source-engine'
 
 top = '.'
@@ -28,24 +28,91 @@ int main() { return (int)FcInit(); }
 
 Context.Context.line_just = 55 # should fit for everything on 80x26
 
-projects=[
-	'tier0','tier1','tier2',
-	'vstdlib','vpklib','filesystem'
-	,'mathlib','tier3',
-	'bitmap','scenefilecache','datacache',
-	'launcher_main','vgui2/vgui_controls','vgui2/matsys_controls','vgui2/vgui_surfacelib',
-	'serverbrowser','soundemittersystem','vgui2/src',
-	'togl','vguimatsurface','vtf','materialsystem/shaderlib',
-	'materialsystem','studiorender','materialsystem/stdshaders',
-	'video','inputsystem','appframework',
-	'launcher','engine/voice_codecs/minimp3','materialsystem/shaderapidx9',
-	'gameui','dmxloader','datamodel','engine','ivp/havana',
-	'ivp/havana/havok/hk_math','ivp/havana/havok/hk_base',
-	'ivp/ivp_compact_builder','ivp/ivp_physics','vphysics','game/server',
-	'particles','choreoobjects','game/client'
-]
+projects={
+	'game': [
+		'appframework',
+		'bitmap',
+		'choreoobjects',
+		'datacache',
+		'datamodel',
+		'dmxloader',
+		'engine',
+		'engine/voice_codecs/minimp3',
+		'filesystem',
+		'game/client',
+		'game/server',
+		'gameui',
+		'inputsystem',
+		'ivp/havana',
+		'ivp/havana/havok/hk_base',
+		'ivp/havana/havok/hk_math',
+		'ivp/ivp_compact_builder',
+		'ivp/ivp_physics',
+		'launcher',
+		'launcher_main',
+		'materialsystem',
+		'materialsystem/shaderapidx9',
+		'materialsystem/shaderlib',
+		'materialsystem/stdshaders',
+		'mathlib',
+		'particles',
+		'scenefilecache',
+		'serverbrowser',
+		'soundemittersystem',
+		'studiorender',
+		'thirdparty/StubSteamAPI',
+		'tier0',
+		'tier1',
+		'tier2',
+		'tier3',
+		'togl',
+		'vgui2/matsys_controls',
+		'vgui2/src',
+		'vgui2/vgui_controls',
+		'vgui2/vgui_surfacelib',
+		'vguimatsurface',
+		'video',
+		'vphysics',
+		'vpklib',
+		'vstdlib',
+		'vtf'
+	],
+	'dedicated': [
+		'appframework',
+		'bitmap',
+		'choreoobjects',
+		'datacache',
+		'dedicated',
+		'dedicated_main',
+		'dmxloader',
+		'engine',
+		'game/server',
+		'ivp/havana',
+		'ivp/havana/havok/hk_base',
+		'ivp/havana/havok/hk_math',
+		'ivp/ivp_compact_builder',
+		'ivp/ivp_physics',
+		'materialsystem',
+		'mathlib',
+		'particles',
+		'scenefilecache',
+		'materialsystem/shaderapiempty',
+		'materialsystem/shaderlib',
+		'soundemittersystem',
+		'studiorender',
+		'tier0',
+		'tier1',
+		'tier2',
+		'tier3',
+		'vphysics',
+		'vpklib',
+		'vstdlib',
+		'vtf',
+		'thirdparty/StubSteamAPI'
+	]
+}
+
 
-projects += ['thirdparty/StubSteamAPI'] # ,'thirdparty/libjpeg','thirdparty/SDL2-src'] # thirdparty projects
 
 @Configure.conf
 def check_pkg(conf, package, uselib_store, fragment, *k, **kw):
@@ -70,7 +137,23 @@ def get_taskgen_count(self):
 	return idx
 
 def define_platform(conf):
-	conf.define('SOURCE1',1)
+	conf.env.DEDICATED = conf.options.DEDICATED
+
+	if conf.options.DEDICATED:
+		conf.options.SDL = False
+#		conf.options.GL = False
+		conf.define('DEDICATED', 1)
+
+	if conf.options.GL:
+		conf.env.append_unique('DEFINES', [
+			'DX_TO_GL_ABSTRACTION',
+			'GL_GLEXT_PROTOTYPES',
+			'BINK_VIDEO'
+		])
+
+	if conf.options.SDL:
+		conf.define('USE_SDL', 1)
+
 	if conf.env.DEST_OS == 'linux':
 		conf.define('_GLIBCXX_USE_CXX11_ABI',0)
 		conf.env.append_unique('DEFINES', [
@@ -79,24 +162,30 @@ def define_platform(conf):
 			'POSIX=1',
 			'_POSIX=1',
 			'GNUC',
-			'DX_TO_GL_ABSTRACTION',
-			'GL_GLEXT_PROTOTYPES',
-			'BINK_VIDEO',
-			'USE_SDL',
 			'NDEBUG',
 			'NO_HOOK_MALLOC',
 			'_DLL_EXT=.so'
 		])
 
+
 def options(opt):
 	grp = opt.add_option_group('Common options')
 
 	grp.add_option('-8', '--64bits', action = 'store_true', dest = 'ALLOW64', default = False,
 		help = 'allow targetting 64-bit engine(Linux/Windows/OSX x86 only) [default: %default]')
 
+	grp.add_option('-d', '--dedicated', action = 'store_true', dest = 'DEDICATED', default = False,
+		help = 'build dedicated server [default: %default]')
+
+	grp.add_option('--use-sdl', action = 'store', dest = 'SDL', type = 'int', default = True,
+		help = 'build engine with SDL [default: %default]')
+
+	grp.add_option('--use-togl', action = 'store', dest = 'GL', type = 'int', default = True,
+		help = 'build engine with ToGL [default: %default]')
+
 	opt.load('compiler_optimizations subproject')
-	opt.add_subproject(projects)
+#	opt.add_subproject(projects['game'])
 
 	opt.load('xcompile compiler_cxx compiler_c sdl2 clang_compilation_database strip_on_install waf_unit_test subproject')
 
 	if sys.platform == 'win32':
@@ -104,6 +193,7 @@ def options(opt):
 	opt.load('reconfigure')
 
 def configure(conf):
+
 	conf.load('fwgslib reconfigure')
 
 	# Force XP compability, all build targets should add
@@ -115,13 +205,16 @@ def configure(conf):
 		conf.load('msvc msvc_pdb msdev msvs')
 	conf.load('subproject xcompile compiler_c compiler_cxx gitversion clang_compilation_database strip_on_install waf_unit_test enforce_pic')
 
-	conf.check_cfg(package='sdl2', uselib_store='SDL2', args=['--cflags', '--libs'])
+	if conf.options.SDL:
+		conf.check_cfg(package='sdl2', uselib_store='SDL2', args=['--cflags', '--libs'])
+	if conf.options.DEDICATED:
+		conf.check_cfg(package='libedit', uselib_store='EDIT', args=['--cflags', '--libs'])
+
 	conf.check_cfg(package='libjpeg', uselib_store='JPEG', args=['--cflags', '--libs'])
 	conf.check_cfg(package='libpng', uselib_store='PNG', args=['--cflags', '--libs'])
 	conf.check_cfg(package='zlib', uselib_store='ZLIB', args=['--cflags', '--libs'])
 	conf.check_cfg(package='openal', uselib_store='OPENAL', args=['--cflags', '--libs'])
 	conf.check_cfg(package='libcurl', uselib_store='CURL', args=['--cflags', '--libs'])
-	conf.check_cfg(package='bzip2', uselib_store='BZIP2', args=['--cflags', '--libs'])
 	conf.check_pkg('freetype2', 'FT2', FT2_CHECK)
 	conf.check_pkg('fontconfig', 'FC', FC_CHECK)
@@ -150,8 +243,17 @@ def configure(conf):
 	]
 
 	cflags, linkflags = conf.get_optimization_flags()
-	cflags += ['-march=pentium4','-mtune=core2','-mfpmath=387']
-	linkflags += ['-march=pentium4','-mtune=core2','-mfpmath=387']
+
+	flags = ['-fPIC']
+
+	if conf.env.DEST_CPU == 'arm':
+		flags += ['-mfpu=neon']
+	else:
+		flags += ['-march=pentium4','-mtune=core2','-mfpmath=387']
+
+	cflags += flags
+	linkflags += flags
+
 	# And here C++ flags starts to be treated separately
 	cxxflags = list(cflags) + ['-std=c++11','-fpermissive']
@@ -180,9 +282,11 @@ def configure(conf):
 		conf.env.append_unique('CFLAGS', cflags)
 		conf.env.append_unique('CXXFLAGS', cxxflags)
 		conf.env.append_unique('LINKFLAGS', linkflags)
+		conf.env.append_unique('INCLUDES', [os.path.abspath('common/')])
 
 	if conf.env.DEST_OS != 'win32':
 		conf.check_cc(lib='dl', mandatory=False)
+		conf.check_cc(lib='bz2', mandatory=False)
 		conf.check_cc(lib='rt', mandatory=False)
 
 		if not conf.env.LIB_M: # HACK: already added in xcompile!
@@ -221,7 +325,14 @@ def configure(conf):
 		conf.env.LIBDIR = conf.env.BINDIR = conf.env.PREFIX
 
 	define_platform(conf)
-	conf.add_subproject(projects)
+
+	if conf.options.DEDICATED:
+		conf.add_subproject(projects['dedicated'])
+	else:
+		conf.add_subproject(projects['game'])
 
 def build(bld):
-	bld.add_subproject(projects)
+	if bld.env.DEDICATED:
+		bld.add_subproject(projects['dedicated'])
+	else:
+		bld.add_subproject(projects['game'])
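
A few standalone sketches of the techniques this patch leans on, for trying the pieces in isolation. First, the feature-detection pattern from the tier0/cpu.cpp and tier1/processor_detect_linux.cpp hunks: ARM has no cpuid instruction, so every x86 capability probe must fail cleanly before any inline asm is reached. The sketch below follows that pattern but uses GCC's <cpuid.h> helper instead of the engine's hand-rolled asm; its CheckSSETechnology is a stand-in, not the engine's exact function:

    #include <stdio.h>

    #if defined(__arm__) || defined(__aarch64__)
    // No cpuid on ARM: report the x86 feature as absent instead of running x86 asm.
    static bool CheckSSETechnology() { return false; }
    #else
    #include <cpuid.h>
    static bool CheckSSETechnology()
    {
        unsigned int eax, ebx, ecx, edx;
        // __get_cpuid returns 0 if leaf 1 is not supported.
        if ( !__get_cpuid( 1, &eax, &ebx, &ecx, &edx ) )
            return false;
        return ( edx & ( 1u << 25 ) ) != 0; // EDX bit 25 = SSE, i.e. the 0x2000000L above
    }
    #endif

    int main()
    {
        printf( "SSE: %s\n", CheckSSETechnology() ? "yes" : "no" );
        return 0;
    }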
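Second, the reason common/ lands on the include paths (the INCLUDES line and the tier1/wscript hunk): common/sse2neon.h reimplements the _mm_* intrinsics on top of NEON, so SSE-using translation units can compile unchanged on ARM. A minimal sketch of the include selection, assuming the unit only needs the classic xmmintrin-level API:

    #if defined(__arm__) || defined(__aarch64__)
    #include "sse2neon.h"   // from common/, which the wscript puts on INCLUDES
    #else
    #include <xmmintrin.h>  // native SSE on x86
    #endif

    int main()
    {
        __m128 a = _mm_set_ps( 4.f, 3.f, 2.f, 1.f );
        __m128 b = _mm_set_ps( 8.f, 7.f, 6.f, 5.f );
        __m128 c = _mm_add_ps( a, b );    // maps to NEON vaddq_f32 on ARM
        float out[4];
        _mm_storeu_ps( out, c );
        return (int)out[0];               // lane 0 holds 6.0f
    }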
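Third, the tier1/reliabletimer.cpp fallback replaces the rdtsc read with a POSIX clock. The sketch below does the same fold of seconds and nanoseconds into one 64-bit tick count, but samples CLOCK_MONOTONIC where the patch uses CLOCK_REALTIME; the monotonic clock has the identical interface and is immune to wall-clock adjustments, which is usually what an interval timer wants. Whether that trade-off matters for CReliableTimer is left as a review question, not settled here:

    #include <time.h>
    #include <stdint.h>
    #include <stdio.h>

    // Stand-in for the ARM branch of CReliableTimer::GetPerformanceCountNow();
    // with this tick source the counter frequency is a fixed 1,000,000,000 per second.
    static int64_t GetTickCountNow()
    {
        struct timespec ts;
        clock_gettime( CLOCK_MONOTONIC, &ts );
        return (int64_t)ts.tv_sec * 1000000000LL + ts.tv_nsec;
    }

    int main()
    {
        int64_t begin = GetTickCountNow();
        for ( volatile int i = 0; i < 1000000; ++i ) {} // something to time
        printf( "elapsed: %lld ns\n", (long long)( GetTickCountNow() - begin ) );
        return 0;
    }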
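Last, the DEBUG_BREAK() rework from tier1/pathmatch.cpp in miniature: "int $3" exists only on x86, so POSIX builds trap through a signal and any remaining target compiles the macro away. Two caveats in this sketch: POSIX is the engine's own define (set by the wscript above), not a compiler builtin, and SIGINT mirrors the patch even though SIGTRAP would be the more conventional break signal:

    #include <signal.h>

    #if defined(POSIX)                                  // engine-defined, not a builtin
    #define DEBUG_BREAK() raise( SIGINT )               // stops in the debugger, as in the patch
    #elif defined(__i386__) || defined(__x86_64__)
    #define DEBUG_BREAK() __asm__ __volatile__ ( "int $3" )
    #else
    #define DEBUG_BREAK()                               // no portable break: compile it away
    #endif

    int main()
    {
        DEBUG_BREAK(); // under a debugger this halts here; otherwise SIGINT ends the process
        return 0;
    }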