core/vul/vul_url.cxx
Go to the documentation of this file.
00001 // This is core/vul/vul_url.cxx
00002 #ifdef VCL_NEEDS_PRAGMA_INTERFACE
00003 #pragma implementation
00004 #endif
00005 //:
00006 // \file
00007 // \author Ian Scott
00008 // Based on vil_stream_url by fsm
00009 // \verbatim
00010 //  Modifications
00011 //   8 Nov 2002 - Peter Vanroose - corrected HTTP client request syntax
00012 // \endverbatim
00013 
00014 #include "vul_url.h"
00015 #include <vcl_cstdio.h>  // sprintf()
00016 #include <vcl_cstring.h>
00017 #include <vcl_cstdlib.h>
00018 #include <vcl_sstream.h>
00019 #include <vcl_cassert.h>
00020 #include <vcl_fstream.h>
00021 #include <vul/vul_file.h>
00022 
00023 #if defined(unix) || defined(__unix) || defined(__unix__)
00024 
00025 # include <unistd.h>       // read(), write(), close()
00026 # include <netdb.h>        // gethostbyname(), sockaddr_in()
00027 # include <sys/socket.h>
00028 # include <netinet/in.h>   // htons()
00029 # ifdef __alpha
00030 #  include <fp.h>          // htons() [ on e.g. DEC alpha, htons is in machine/endian.h ]
00031 # endif
00032 # define SOCKET int
00033 
00034 #elif defined (VCL_WIN32) && !defined(__CYGWIN__)
00035 
00036 # include <winsock2.h>
00037 
00038 #endif // unix
00039 
00040 #if defined(VCL_WIN32) && !defined(__CYGWIN__)
00041 // So that we don't call WSAStartup more than we need to
00042 static int called_WSAStartup = 0;
00043 #endif
00044 
00045 //: only call this method with a correctly formatted http URL
00046 vcl_istream * vul_http_open(char const *url)
00047 {
00048   // split URL into auth, host, path and port number.
00049   vcl_string host;
00050   vcl_string path;
00051   vcl_string auth;
00052   int port = 80; // default
00053 
00054   // check it is an http URL.
00055   assert (vcl_strncmp(url, "http://", 7) == 0);
00056 
00057   char const *p = url + 7;
00058   while (*p && *p!='/')
00059     ++ p;
00060   host = vcl_string(url+7, p);
00061 
00062 
00063   if (*p)
00064     path = p+1;
00065   else
00066     path = "";
00067 
00068   //authentication
00069   for (unsigned int i=0; i<host.size(); ++i)
00070     if (host[i] == '@') {
00071       auth = vcl_string(host.c_str(), host.c_str()+i);
00072       host = vcl_string(host.c_str()+i+1, host.c_str() + host.size());
00073       break;
00074     }
00075 
00076   // port?
00077   for (unsigned int i=host.size()-1; i>0; --i)
00078     if (host[i] == ':') {
00079       port = vcl_atoi(host.c_str() + i + 1);
00080       host = vcl_string(host.c_str(), host.c_str() + i);
00081       break;
00082     }
00083 
00084   // do character translation
00085   unsigned k =0;
00086   while (k < path.size())
00087   {
00088     if (path[k] == ' ')
00089       path.replace(k, 1, "%20");
00090     else if (path[k] == '%')
00091       path.replace(k, 1, "%25");
00092     k++;
00093   }
00094 
00095   // so far so good.
00096 #ifdef DEBUG
00097   vcl_cerr << "auth = \'" << auth << "\'\n"
00098            << "host = \'" << host << "\'\n"
00099            << "path = \'" << path << "\'\n"
00100            << "port = " << port << vcl_endl;
00101 #endif
00102 
00103 #if defined(VCL_WIN32) && !defined(__CYGWIN__)
00104   if (called_WSAStartup==0)
00105   {
00106     WORD wVersionRequested;
00107     WSADATA wsaData;
00108 
00109     wVersionRequested = MAKEWORD( 2, 2 );
00110 
00111     /* int err = */ WSAStartup( wVersionRequested, &wsaData );
00112   }
00113 #endif
00114 
00115   // create socket endpoint.
00116   SOCKET tcp_socket = socket(PF_INET,      // IPv4 protocols.
00117                              SOCK_STREAM,  // two-way, reliable,
00118                                            // connection-based stream socket.
00119                              PF_UNSPEC);   // protocol number.
00120 #if defined(VCL_WIN32) && !defined(__CYGWIN__)
00121   if (tcp_socket == INVALID_SOCKET) {
00122 # ifndef NDEBUG
00123     vcl_cerr << __FILE__ "error code : " << WSAGetLastError() << '\n';
00124 # endif
00125 #else
00126   if (tcp_socket < 0) {
00127 #endif
00128     vcl_cerr << __FILE__ ": failed to create socket.\n";
00129     return 0;
00130   }
00131 
00132 #ifdef DEBUG
00133   vcl_cerr << __FILE__ ": tcp_socket = " << tcp_socket << '\n';
00134 #endif
00135 
00136   // get network address of server.
00137   hostent *hp = gethostbyname(host.c_str());
00138   if (! hp) {
00139     vcl_cerr << __FILE__ ": failed to lookup host\n";
00140 
00141 #if defined(VCL_WIN32) && !defined(__CYGWIN__)
00142     closesocket(tcp_socket);
00143 #else
00144     close(tcp_socket);
00145 #endif
00146 
00147     return 0;
00148   }
00149 
00150   // make socket address.
00151   sockaddr_in my_addr;
00152   my_addr.sin_family = AF_INET;
00153   // convert port number to network byte order..
00154   my_addr.sin_port = htons(port);
00155   vcl_memcpy(&my_addr.sin_addr, hp->h_addr_list[0], hp->h_length);
00156 
00157   // connect to server.
00158   if (connect(tcp_socket , (sockaddr *) &my_addr, sizeof my_addr) < 0) {
00159     vcl_cerr << __FILE__ ": failed to connect to host\n";
00160     //perror(__FILE__);
00161 
00162 #if defined(VCL_WIN32) && !defined(__CYGWIN__)
00163     closesocket(tcp_socket);
00164 #else
00165     close(tcp_socket);
00166 #endif
00167 
00168     return 0;
00169   }
00170 
00171   // buffer for data transfers over socket.
00172   char buffer[4096];
00173 
00174   // send HTTP 1.1 request.
00175   vcl_snprintf(buffer, 4090-vcl_strlen(buffer),
00176                "GET %s HTTP/1.1\r\nUser-Agent: vul_url\r\nHost: %s\r\nAccept: */*\r\n",
00177                url, host.c_str());
00178 
00179   if (auth != "")
00180     vcl_snprintf(buffer+vcl_strlen(buffer), 4090-vcl_strlen(buffer),
00181                  "Authorization: Basic %s\r\n",
00182                  vul_url::encode_base64(auth).c_str());
00183 
00184   if (vcl_snprintf(buffer+vcl_strlen(buffer), 4090-vcl_strlen(buffer), "\r\n") < 0)
00185   {
00186     vcl_cerr << "ERROR: vul_http_open buffer overflow.";
00187     vcl_abort();
00188   }
00189 
00190 #if defined(VCL_WIN32) && !defined(__CYGWIN__)
00191   if (send(tcp_socket, buffer, vcl_strlen(buffer), 0) < 0) {
00192 #else
00193   if (::write(tcp_socket, buffer, vcl_strlen(buffer)) < 0) {
00194 #endif
00195     vcl_cerr << __FILE__ ": error sending HTTP request\n";
00196 
00197 #if defined(VCL_WIN32) && !defined(__CYGWIN__)
00198     closesocket(tcp_socket);
00199 #else
00200     close(tcp_socket);
00201 #endif
00202     return 0;
00203   }
00204 
00205 
00206   // read from socket into memory.
00207   vcl_string contents;
00208   {
00209     int n;
00210 #if defined(VCL_WIN32) && !defined(__CYGWIN__)
00211     while ((n = recv(tcp_socket, buffer, sizeof buffer,0 )) > 0) {
00212 #else
00213     while ((n = ::read(tcp_socket, buffer, sizeof buffer)) > 0) {
00214 #endif
00215       contents.append(buffer, n);
00216 #ifdef DEBUG
00217       vcl_cerr << n << " bytes\n";
00218 #endif
00219     }
00220   }
00221 
00222   // close connection to server.
00223 #if defined(VCL_WIN32) && !defined(__CYGWIN__)
00224   closesocket(tcp_socket);
00225 #else
00226   close(tcp_socket);
00227 #endif
00228 
00229 #ifdef DEBUG
00230   vcl_cerr << "HTTP server returned:\n" << contents << '\n';
00231 #endif
00232 
00233   if (contents.find("HTTP/1.1 200") == contents.npos)
00234   {
00235     return 0;
00236   }
00237   vcl_string::size_type n = contents.find("\r\n\r\n");
00238   if (n == contents.npos)
00239   {
00240     return 0;
00241   }
00242 
00243   contents.erase(0,n+4);
00244 #ifdef DEBUG
00245   vcl_cerr << "vul_url::vul_http_open() returns:\n" << contents << '\n';
00246 #endif
00247   return new vcl_istringstream(contents);
00248 }
00249 
00250 
00251 //: only call this method with a correctly formatted http URL
00252 bool vul_http_exists(char const *url)
00253 {
00254   // split URL into auth, host, path and port number.
00255   vcl_string host;
00256   vcl_string path;
00257   vcl_string auth;
00258   int port = 80; // default
00259   assert (vcl_strncmp(url, "http://", 7) == 0);
00260 
00261   char const *p = url + 7;
00262   while (*p && *p!='/')
00263     ++ p;
00264   host = vcl_string(url+7, p);
00265 
00266 
00267   if (*p)
00268     path = p+1; // may be the empty string, if URL ends in a slash
00269   else
00270     path = "";
00271 
00272   //authentication
00273   for (unsigned int i=0; i<host.size(); ++i)
00274     if (host[i] == '@') {
00275       auth = vcl_string(host.c_str(), host.c_str()+i);
00276       host = vcl_string(host.c_str()+i+1, host.c_str() + host.size());
00277       break;
00278     }
00279 
00280   // port?
00281   for (unsigned int i=0; i<host.size(); ++i)
00282     if (host[i] == ':') {
00283       port = vcl_atoi(host.c_str() + i + 1);
00284       host = vcl_string(host.c_str(), host.c_str() + i);
00285       break;
00286     }
00287 
00288   // do character translation
00289   unsigned k =0;
00290   while (k < path.size())
00291   {
00292     if (path[k] == ' ')
00293       path.replace(k, 1, "%20");
00294     else if (path[k] == '%')
00295       path.replace(k, 1, "%25");
00296     k++;
00297   }
00298 
00299   // so far so good.
00300 #ifdef DEBUG
00301   vcl_cerr << "auth = \'" << auth << "\'\n"
00302            << "host = \'" << host << "\'\n"
00303            << "path = \'" << path << "\'\n"
00304            << "port = " << port << vcl_endl;
00305 #endif
00306 
00307 #if defined(VCL_WIN32) && !defined(__CYGWIN__)
00308   if (called_WSAStartup==0)
00309   {
00310     WORD wVersionRequested;
00311     WSADATA wsaData;
00312 
00313     wVersionRequested = MAKEWORD( 2, 2 );
00314 
00315     /* int err = */ WSAStartup( wVersionRequested, &wsaData );
00316   }
00317 #endif
00318 
00319   // create socket endpoint.
00320   SOCKET tcp_socket = socket(PF_INET,      // IPv4 protocols.
00321                              SOCK_STREAM,  // two-way, reliable,
00322                                            // connection-based stream socket.
00323                              PF_UNSPEC);   // protocol number.
00324 
00325 #if defined(VCL_WIN32) && !defined(__CYGWIN__)
00326   if (tcp_socket == INVALID_SOCKET) {
00327 # ifndef NDEBUG
00328     vcl_cerr << "error code : " << WSAGetLastError() << vcl_endl;
00329 # endif
00330 #else
00331   if (tcp_socket < 0) {
00332 #endif
00333     vcl_cerr << __FILE__ ": failed to create socket.\n";
00334     return false;
00335   }
00336 
00337 #ifdef DEBUG
00338   vcl_cerr << __FILE__ ": tcp_socket = " << tcp_socket << vcl_endl;
00339 #endif
00340 
00341   // get network address of server.
00342   hostent *hp = gethostbyname(host.c_str());
00343   if (! hp) {
00344     vcl_cerr << __FILE__ ": failed to lookup host\n";
00345     return false;
00346   }
00347 
00348   // make socket address.
00349   sockaddr_in my_addr;
00350   my_addr.sin_family = AF_INET;
00351     // convert port number to network byte order..
00352   my_addr.sin_port = htons(port);
00353   vcl_memcpy(&my_addr.sin_addr, hp->h_addr_list[0], hp->h_length);
00354 
00355   // connect to server.
00356   if (connect(tcp_socket , (sockaddr *) &my_addr, sizeof my_addr) < 0)
00357   {
00358     vcl_cerr << __FILE__ ": failed to connect to host\n";
00359     //perror(__FILE__);
00360 #if defined(VCL_WIN32) && !defined(__CYGWIN__)
00361     closesocket(tcp_socket);
00362 #else
00363     close(tcp_socket);
00364 #endif
00365 
00366     return false;
00367   }
00368 
00369   // buffer for data transfers over socket.
00370   char buffer[4096];
00371 
00372   // send HTTP 1.1 request.
00373   vcl_snprintf(buffer, 4090,
00374                "HEAD %s HTTP/1.1\r\nUser-Agent: vul_url\r\nHost: %s\r\nAccept: */*\r\n",
00375                url, host.c_str());
00376   if (auth != "")
00377     vcl_snprintf(buffer+vcl_strlen(buffer), 4090-vcl_strlen(buffer),
00378                  "Authorization: Basic %s\r\n",
00379                  vul_url::encode_base64(auth).c_str() );
00380 
00381   if (vcl_snprintf(buffer+vcl_strlen(buffer), 4090-vcl_strlen(buffer), "\r\n") < 0)
00382   {
00383     vcl_cerr << "ERROR: vul_http_exists buffer overflow.";
00384     vcl_abort();
00385   }
00386 
00387 #if defined(VCL_WIN32) && !defined(__CYGWIN__)
00388   if (send(tcp_socket, buffer, vcl_strlen(buffer), 0) < 0) {
00389 #else
00390   if (::write(tcp_socket, buffer, vcl_strlen(buffer)) < 0) {
00391 #endif
00392     vcl_cerr << __FILE__ ": error sending HTTP request\n";
00393 
00394 #if defined(VCL_WIN32) && !defined(__CYGWIN__)
00395     closesocket(tcp_socket);
00396 #else
00397     close(tcp_socket);
00398 #endif
00399     return false;
00400   }
00401 
00402 
00403   // read from socket into memory.
00404   vcl_string contents;
00405   {
00406     int n;
00407 #if defined(VCL_WIN32) && !defined(__CYGWIN__)
00408     if ((n = recv(tcp_socket, buffer, sizeof buffer,0 )) > 0) {
00409 #else
00410     if ((n = ::read(tcp_socket, buffer, sizeof buffer)) > 0) {
00411 #endif
00412       contents.append(buffer, n);
00413       //vcl_cerr << n << " bytes\n";
00414     }
00415     else
00416     {
00417 #if defined(VCL_WIN32) && !defined(__CYGWIN__)
00418       closesocket(tcp_socket);
00419 #else
00420       close(tcp_socket);
00421 #endif
00422       return false;
00423     }
00424   }
00425 
00426   // close connection to server.
00427 #if defined(VCL_WIN32) && !defined(__CYGWIN__)
00428   closesocket(tcp_socket);
00429 #else
00430   close(tcp_socket);
00431 #endif
00432 
00433 #ifdef DEBUG
00434   vcl_cerr << "HTTP server returned:\n" << contents << '\n';
00435 #endif
00436 
00437   return contents.find("HTTP/1.1 200") != contents.npos;
00438 }
00439 
00440 
00441 vcl_istream * vul_url::open(const char * url, vcl_ios_openmode mode)
00442 {
00443   // check for null pointer or empty strings.
00444   if (!url || !*url)
00445     return 0;
00446   unsigned l = vcl_strlen(url);
00447 
00448   // check for filenames beginning "file:".
00449   if (l > 7 && vcl_strncmp(url, "file://", 7) == 0)
00450     return new vcl_ifstream(url+7,mode);
00451 
00452   // maybe it's an http URL?
00453   if (l > 7 && vcl_strncmp(url, "http://", 7) == 0)
00454     return vul_http_open(url);
00455 
00456   // maybe it's an ftp URL?
00457   if (l > 6 && vcl_strncmp(url, "ftp://", 6) == 0)
00458   {
00459     vcl_cerr << __LINE__ << "ERROR:\n vul_read_url(const char * url)\n"
00460       "Doesn't support FTP yet, url=" << url << vcl_endl;
00461     return 0;
00462   }
00463 
00464   // try an ordinary filename
00465   return new vcl_ifstream(url, mode);
00466 }
00467 
00468 
00469 //: Does that URL exist
00470 bool vul_url::exists(const char * url)
00471 {
00472   // check for null pointer or empty strings.
00473   if (!url || !*url)
00474     return false;
00475   unsigned l = vcl_strlen(url);
00476 
00477   // check for filenames beginning "file:".
00478   if (l > 7 && vcl_strncmp(url, "file://", 7) == 0)
00479     return vul_file::exists(url+7);
00480 
00481   // maybe it's an http URL?
00482   if (l > 7 && vcl_strncmp(url, "http://", 7) == 0)
00483     return vul_http_exists(url);
00484 
00485   // maybe it's an ftp URL?
00486   if (l > 6 && vcl_strncmp(url, "ftp://", 6) == 0)
00487   {
00488     vcl_cerr << "ERROR: vul_read_url(const char * url)\n"
00489       "Doesn't support FTP yet, url=" << url << vcl_endl;
00490     return false;
00491   }
00492 
00493   // try an ordinary filename
00494   return vul_file::exists(url);
00495 }
00496 
00497 //: Is that a URL
00498 bool vul_url::is_url(const char * url)
00499 {
00500   // check for null pointer or empty strings.
00501   if (!url || !*url)
00502     return false;
00503   unsigned l = vcl_strlen(url);
00504 
00505   // check for filenames beginning "file:".
00506   if (l > 7 && vcl_strncmp(url, "file://", 7) == 0)
00507     return true;
00508 
00509   // maybe it's an http URL?
00510   if (l > 7 && vcl_strncmp(url, "http://", 7) == 0)
00511     return true;
00512 
00513   // maybe it's an ftp URL?
00514   if (l > 6 && vcl_strncmp(url, "ftp://", 6) == 0)
00515     return true;
00516 
00517   return false;
00518 }
00519 
00520 //=======================================================================
00521 
00522 bool vul_url::is_file(const char * fn)
00523 {
00524   if (vul_url::is_url(fn))
00525     return vul_url::exists(fn);
00526   else
00527     return vul_file::exists(fn) && ! vul_file::is_directory(fn);
00528 }
00529 
00530 //=======================================================================
00531 
// The base64 alphabet: the character at index v encodes the 6-bit value v.
static const
char base64_encoding[]=
  "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  "abcdefghijklmnopqrstuvwxyz"
  "0123456789+/";

// Result storage for encode_triplet(); overwritten by every call.
static char out_buf[4];

//: Encode up to three input bytes as four base64 characters.
// \param data  the input bytes; data[0] and data[1] are always read, data[2]
//              only when n is not 1.
// \param n     how many of the bytes are real data (1, 2 or 3; asserted).
// \return a pointer to the four output characters in out_buf (not
//         nul-terminated).  Positions that have no input data are '='.
static const char * encode_triplet(char data[3], unsigned n)
{
  assert (n>0 && n <4);

  // Work on the raw byte values so that the shifts below are plain
  // bit-field extraction.
  const unsigned first  = (unsigned char) data[0];
  const unsigned second = (unsigned char) data[1];

  out_buf[0] = base64_encoding[first >> 2];
  out_buf[1] = base64_encoding[((first & 0x03u) << 4) | (second >> 4)];

  if (n==1)
  {
    // one input byte: the last two positions are padding.
    out_buf[2] = '=';
    out_buf[3] = '=';
    return out_buf;
  }

  const unsigned third = (unsigned char) data[2];
  out_buf[2] = base64_encoding[((second & 0x0fu) << 2) | (third >> 6)];

  // two input bytes: only the last position is padding.
  out_buf[3] = (n==2) ? '=' : base64_encoding[third & 0x3fu];
  return out_buf;
}
00568 
00569 //=======================================================================
00570 
00571 vcl_string vul_url::encode_base64(const vcl_string& in)
00572 {
00573   vcl_string out;
00574   unsigned i = 0, line_octets = 0;
00575   const unsigned l = in.size();
00576   char data[3];
00577   while (i <= l)
00578   {
00579     if (i == l)
00580     {
00581       out.append("=");
00582       return out;
00583     }
00584 
00585     data[0] = in[i++];
00586     data[1] = data[2] = 0;
00587 
00588     if (i == l)
00589     {
00590       out.append(encode_triplet(data,1),4);
00591       return out;
00592     }
00593 
00594     data[1] = in[i++];
00595 
00596     if (i == l)
00597     {
00598       out.append(encode_triplet(data,2),4);
00599       return out;
00600     }
00601 
00602     data[2] = in[i++];
00603 
00604     out.append(encode_triplet(data,3),4);
00605 
00606     if (line_octets >= 68/4) // print carriage return
00607     {
00608       out.append("\r\n",2);
00609       line_octets = 0;
00610     }
00611     else
00612       ++line_octets;
00613   }
00614 
00615   return out;
00616 }
00617 
00618 //=======================================================================
00619 
00620 static int get_next_char(const vcl_string &in, unsigned int *i)
00621 {
00622   while (*i < in.size())
00623   {
00624     char c;
00625     c = in[(*i)++];
00626 
00627     if (c == '+')
00628       return 62;
00629 
00630     if (c == '/')
00631       return 63;
00632 
00633     if (c >= 'A' && c <= 'Z')
00634       return 0 + (int)c - (int)'A';
00635 
00636     if (c >= 'a' && c <= 'z')
00637       return 26 + (int)c - (int)'a';
00638 
00639     if (c >= '0' && c <= '9')
00640       return 52 + (int)c - (int)'0';
00641 
00642     if (c == '=')
00643       return 64;
00644   }
00645   return -1;
00646 }
00647 
00648 //=======================================================================
00649 
00650 vcl_string vul_url::decode_base64(const vcl_string& in)
00651 {
00652   int c;
00653   char data[3];
00654 
00655   unsigned i=0;
00656   const unsigned l = in.size();
00657   vcl_string out;
00658   while (i < l)
00659   {
00660     data[0] = data[1] = data[2] = 0;
00661 
00662     // -=- 0 -=-
00663     // Search next valid char...
00664     c = get_next_char(in , &i);
00665 
00666     // treat '=' as end of message
00667     if (c == 64)
00668       return out;
00669     if (c==-1)
00670       return "";
00671 
00672     data[0] = char(((c & 0x3f) << 2) | (0x3 & data[0]));
00673 
00674     // -=- 1 -=-
00675     // Search next valid char...
00676     c = get_next_char(in , &i);
00677 
00678       // Error! Second character in octet can't be '='
00679     if (c == 64 || c==-1)
00680       return "";
00681 
00682     data[0] = char(((c & 0x30) >> 4) | (0xfc & data[0]));
00683     data[1] = char(((c & 0x0f) << 4) | (0x0f & data[1]));
00684 
00685     // -=- 2 -=-
00686     // Search next valid char...
00687 
00688     c = get_next_char(in , &i);
00689 
00690     if (c==-1)
00691       return "";
00692     if (c == 64)
00693     {
00694       // should really read next char and check it is '='
00695       out.append(data,1);  // write 1 byte to output
00696       return out;
00697     }
00698 
00699     data[1] = char(((c & 0x3c) >> 2) | (0xf0 & data[1]));
00700     data[2] = char(((c & 0x03) << 6) | (0x3f & data[2]));
00701 
00702     // -=- 3 -=-
00703     // Search next valid char...
00704     c = get_next_char(in , &i);
00705 
00706     if (c==-1)
00707       return "";
00708 
00709     if (c == 64)
00710     {
00711       out.append(data,2);  // write 2 bytes to output
00712       return out;
00713     }
00714 
00715     data[2] = char((c & 0x3f) | (0xc0 & data[2]));
00716 
00717     out.append(data,3);  // write 3 bytes to output
00718   }
00719 
00720   return out;
00721 }