duckscraper/lib/tidy/experimental/httpio.c

224 lines
5.9 KiB
C

#include "tmbstr.h"
#include "httpio.h"
/*
 * makeConnection: resolve pHttp->pHostName and open a TCP connection to it
 * on port pHttp->nPort.  On success the connected socket is stored in
 * pHttp->s.
 *
 * Returns 0 on success, ECONNREFUSED if the peer refused the connection,
 * and -1 for every other failure (lookup, socket creation, connect).
 * When connect fails the socket is closed here and pHttp->s is set to -1,
 * so no descriptor is left open behind a failed call.
 */
int
makeConnection ( HTTPInputSource *pHttp )
{
    struct sockaddr_in sock;
    struct hostent *pHost;

    /* Get internet address of the host. */
    if (!(pHost = gethostbyname ( pHttp->pHostName )))
    {
        return -1;
    }

    /* Only an IPv4 address fits into sockaddr_in; refuse anything else
       rather than let memcpy write past sin_addr. */
    if (pHost->h_addrtype != AF_INET
        || pHost->h_length < 0
        || pHost->h_length > (int) sizeof( sock.sin_addr ))
    {
        return -1;
    }

    /* Start from an all-zero address so the padding bytes are defined. */
    memset (&sock, 0, sizeof( sock ));

    /* Copy the address of the host to socket description. */
    memcpy (&sock.sin_addr, pHost->h_addr, pHost->h_length);

    /* Set port and protocol */
    sock.sin_family = AF_INET;
    sock.sin_port = htons( pHttp->nPort );

    /* Make an internet socket, stream type. */
    if ((pHttp->s = socket (AF_INET, SOCK_STREAM, 0)) == -1)
        return -1;

    /* Connect the socket to the remote host. */
    if (connect (pHttp->s, (struct sockaddr *) &sock, sizeof( sock )))
    {
        /* Save errno first: closing the socket may overwrite it. */
        int connectErr = errno;

        closesocket( pHttp->s );
        pHttp->s = -1;

        if (connectErr == ECONNREFUSED)
            return ECONNREFUSED;
        return -1;
    }
    return 0;
}
/*
 * parseURL: split a URL of the form [http://]host[:port][/resource] into
 * pHttp->pHostName, pHttp->nPort and pHttp->pResource.
 *
 * - A scheme other than "http://" is rejected.
 * - The port defaults to 80; an explicit port must be all digits and lie
 *   in the range 1..65535.
 * - Leading slashes of the resource are stripped; the stored resource is a
 *   copy of whatever follows them (possibly the empty string).
 *
 * Returns 0 on success and -1 on any malformed input.  The host name is
 * duplicated before the port is examined, so on a port error
 * pHttp->pHostName has already been assigned.
 */
int parseURL( HTTPInputSource *pHttp, tmbstr url )
{
    int i, j = 0;
    ctmbstr pStr;

    if (NULL == pHttp || NULL == url)
        return -1;

    pStr = tmbsubstr( url, "://" );

    /* If protocol is there, but not http, bail out, else assume http. */
    if (NULL != pStr)
    {
        if (tmbstrncasecmp( url, "http://", 7 ))
            return -1;
        /* Host name starts right after the "://" separator. */
        j = (int)(pStr - url) + 3;
    }

    /* The host name runs up to the first ':' or '/' or the end of string. */
    for (i = j; url[i] && url[i] != ':' && url[i] != '/'; i++) {}
    if (i == j)
        return -1;

    /* Get the hostname. */
    pHttp->pHostName = tmbstrndup (&url[j], i - j );

    if (url[i] == ':')
    {
        /* We have a colon delimiting the hostname.  It should mean that
           a port number is following it */
        int port = 0;

        pHttp->nPort = 0;
        ++i;

        /* <ctype.h> functions need an unsigned char value. */
        if (!isdigit( (unsigned char) url[i] ))
            return -1;              /* misformed port number */

        for (; url[i] && url[i] != '/'; i++)
        {
            if (!isdigit( (unsigned char) url[i] ))
                return -1;
            port = 10 * port + (url[i] - '0');
            /* Checking on every digit keeps the accumulator from
               overflowing on absurdly long digit strings. */
            if (port > 65535)
                return -1;
        }
        if (0 == port)
            return -1;
        pHttp->nPort = port;
    }
    else
    {
        /* Assume default port. */
        pHttp->nPort = 80;
    }

    /* skip past the delimiting slash (we'll add it later ) */
    while (url[i] == '/')
        i++;
    pHttp->pResource = tmbstrdup (url + i );
    return 0;
}
/*
 * fillBuffer: read the next chunk from the socket into in->buffer.
 *
 * After the call in->nBufSize is the number of valid bytes in the buffer
 * (0 when the socket is not open, the peer closed the connection, or the
 * receive failed) and, if a receive was attempted, in->nextBytePos is
 * reset to 0.  When the data does not fill the buffer it is followed by a
 * '\0' byte.
 *
 * Returns the result of recv (positive byte count, 0 on orderly close,
 * negative on error), or 0 when there is no open socket.  The raw result
 * is never stored in nBufSize: a negative value there would later be used
 * as a length/index by the readers.
 */
int fillBuffer( HTTPInputSource *in )
{
    int received = 0;

    if (0 < in->s)
    {
        received = (int) recv( in->s, in->buffer, sizeof( in->buffer ), 0 );
        in->nextBytePos = 0;

        if (received > 0)
            in->nBufSize = received;
        else
            in->nBufSize = 0;   /* error or end of stream: nothing valid */

        if (received < 0 || (size_t) received < sizeof( in->buffer ))
            in->buffer[in->nBufSize] = '\0';
    }
    else
    {
        in->nBufSize = 0;
    }
    return received;
}
/*
 * openURL: initialise the input source, parse pUrl, connect to the host,
 * send an HTTP/1.0 GET request and consume the response header so that
 * the next byte read is the first non-whitespace byte of the body.
 *
 * Return value:
 *   - -1 if the URL cannot be parsed, the request buffer cannot be
 *     allocated, or the request cannot be sent;
 *   - the non-zero result of makeConnection if connecting fails;
 *   - otherwise the result of the last fillBuffer call made while
 *     skipping the header.
 * NOTE(review): on the normal path that last value is a positive byte
 * count, not 0 -- confirm what callers test for before changing it.
 *
 * NOTE(review): sourceData is stored through a cast to uint, which cannot
 * hold a pointer on platforms where pointers are wider than uint; the
 * field type is declared outside this file.
 */
int openURL( HTTPInputSource *in, tmbstr pUrl )
{
    int rc = -1;
#ifdef WIN32
    WSADATA wsaData;

    /* 514 == 0x0202 */
    rc = WSAStartup( 514, &wsaData );
#endif

    in->tis.getByte    = (TidyGetByteFunc) HTTPGetByte;
    in->tis.ungetByte  = (TidyUngetByteFunc) HTTPUngetByte;
    in->tis.eof        = (TidyEOFFunc) HTTPIsEOF;
    in->tis.sourceData = (uint) in;
    in->nextBytePos = in->nextUnGotBytePos = in->nBufSize = 0;

    /* Mark the socket as closed (same value closeURL uses) so that a
       failure before the socket is created leaves a defined state. */
    in->s = -1;

    /* Without a valid host name there is nothing to connect to. */
    if (0 != parseURL( in, pUrl ))
        return -1;

    if (0 == (rc = makeConnection( in )))
    {
        char ch, lastCh = '\0';
        int blanks = 0;
        int sent;
        /* The fixed part of the request is 37 characters plus the
           terminator; 48 leaves some slack. */
        char *getCmd = MemAlloc( 48 + strlen( in->pResource ));

        if (NULL == getCmd)
            return -1;

        sprintf( getCmd, "GET /%s HTTP/1.0\r\nAccept: text/html\r\n\r\n", in->pResource );
        sent = (int) send( in->s, getCmd, strlen( getCmd ), 0 );
        MemFree( getCmd );
        if (sent < 0)
            return -1;

        /* skip past the header information */
        while (    in->nextBytePos >= in->nBufSize
                && 0 < (rc = fillBuffer( in )))
        {
            /* The header already ended exactly at the end of the previous
               buffer; the fresh buffer is body data. */
            if (1 < blanks)
                break;

            for (;    in->nextBytePos < in->nBufSize
                   && in->nextBytePos < sizeof( in->buffer )
                   && 0 != in->buffer[ in->nextBytePos ];
                 in->nextBytePos++ )
            {
                ch = in->buffer[ in->nextBytePos ];
                if (ch == '\r' || ch == '\n')
                {
                    if (ch == lastCh)
                    {
                        /* Two carriage returns or two newlines in a row,
                           that's good enough */
                        blanks++;
                    }
                    if (lastCh == '\r' || lastCh == '\n')
                    {
                        blanks++;
                    }
                }
                else
                    blanks = 0;
                lastCh = ch;

                if (1 < blanks)
                {
                    /* end of header, scan to first non-white and return.
                       Stop at the end of the received data instead of
                       relying on a terminator, which is absent when the
                       buffer is completely full. */
                    while ('\0' != ch && isspace( (unsigned char) ch ))
                    {
                        ++in->nextBytePos;
                        if (   in->nextBytePos >= in->nBufSize
                            || in->nextBytePos >= sizeof( in->buffer ))
                            break;
                        ch = in->buffer[ in->nextBytePos ];
                    }
                    break;
                }
            }
        }
    }
    return rc;
}
/*
 * closeURL: shut the connection down and detach the source.
 * The socket is closed only when it holds a positive value; afterwards it
 * is marked as closed (-1) and the source data reference is cleared.  On
 * WIN32 the socket library is released as well.
 */
void closeURL( HTTPInputSource *source )
{
    if (source->s > 0)
    {
        closesocket( source->s );
    }

    source->tis.sourceData = 0;
    source->s = -1;

#ifdef WIN32
    WSACleanup();
#endif
}
/*
 * HTTPGetByte: return the next byte of the stream.
 *
 * Bytes pushed back with HTTPUngetByte are returned first, most recent
 * first.  Otherwise the receive buffer is refilled when it has been used
 * up, and EndOfStream is returned once no more data is available.
 *
 * Bytes are returned as values 0..255: going through unsigned char keeps
 * bytes >= 0x80 from being sign-extended into negative ints.
 */
int HTTPGetByte( HTTPInputSource *source )
{
    if (source->nextUnGotBytePos)
        return (unsigned char) source->unGetBuffer[ --source->nextUnGotBytePos ];

    if (0 != source->nBufSize && source->nextBytePos >= source->nBufSize)
    {
        fillBuffer( source );
    }

    /* Nothing was ever read, or the refill produced no data. */
    if (0 == source->nBufSize || source->nextBytePos >= source->nBufSize)
        return EndOfStream;

    return (unsigned char) source->buffer[ source->nextBytePos++ ];
}
/*
 * HTTPUngetByte: push one byte back onto the source so that the next
 * HTTPGetByte returns it.  At most 16 bytes can be pending; a byte pushed
 * beyond that limit is silently discarded.
 */
void HTTPUngetByte( HTTPInputSource *source, uint byteValue )
{
    /* Only you can prevent buffer overflows */
    if (source->nextUnGotBytePos >= 16)
        return;

    source->unGetBuffer[ source->nextUnGotBytePos ] = (char) byteValue;
    source->nextUnGotBytePos++;
}
/*
 * HTTPIsEOF: report whether the source has run out of data.
 *
 * Returns no while pushed-back bytes are pending or unread bytes remain in
 * the buffer.  If a previously filled buffer has been consumed, another
 * read is attempted before deciding.  Returns yes otherwise.
 */
Bool HTTPIsEOF( HTTPInputSource *source )
{
    Bool atEnd = yes;

    if (source->nextUnGotBytePos)
    {
        /* pending ungot bytes, not done */
        atEnd = no;
    }
    else
    {
        /* The existing buffer is used up: try to get another one. */
        if (source->nBufSize != 0
            && !(source->nextBytePos < source->nBufSize))
        {
            fillBuffer( source );
        }

        /* Unread data in the buffer means we are not done; otherwise
           the buffer is empty and the last receive brought nothing. */
        if (source->nextBytePos < source->nBufSize)
            atEnd = no;
    }
    return atEnd;
}