36 #include "BESInternalError.h"
37 #include "BESForbiddenError.h"
38 #include "BESSyntaxUserError.h"
39 #include "BESNotFoundError.h"
40 #include "BESTimeoutError.h"
45 #include "HttpCache.h"
46 #include "HttpUtils.h"
47 #include "CurlUtils.h"
48 #include "HttpNames.h"
49 #include "RemoteResource.h"
50 #include "TheBESKeys.h"
51 #include "BESStopWatch.h"
// Name of the BES configuration key that the constructors mention in their
// "is not set" error while handling file:// URLs.
56 #define BES_CATALOG_ROOT_KEY "BES.Catalog.catalog.RootDirectory"
// Expands to the string "RemoteResource::<function>() - " (via __func__ of the
// enclosing function); used as the prefix of debug and error messages here.
58 #define prolog std::string("RemoteResource::").append(__func__).append("() - ")
// Constructs a RemoteResource for `url` on behalf of user `uid`.
// The visible code resets state, allocates the response-header containers and
// then branches on the URL scheme: FILE_PROTOCOL, HTTP(S)_PROTOCOL, or an
// "Unsupported protocol" error. BESInternalError is thrown for an empty URL
// and when the catalog root key is not set.
// NOTE(review): this listing omits several source lines (guards, closing
// braces, declarations), so the conditions guarding the throws are not shown.
65 RemoteResource::RemoteResource(
const std::string &url,
const std::string &uid){
67 d_initialized =
false;
71 d_resourceCacheFileName.clear();
// Heap-allocated containers: raw header lines and the parsed key/value map.
72 d_response_headers =
new vector<string>();
73 d_http_response_headers =
new map<string, string>();
// presumably guarded by an empty-URL check on an omitted line -- confirm
76 throw BESInternalError(prolog +
"Remote resource URL is empty.", __FILE__, __LINE__);
// file:// URL: the cache file name is the URL with the protocol prefix removed.
79 if(url.find(FILE_PROTOCOL) == 0){
80 d_resourceCacheFileName = url.substr(strlen(FILE_PROTOCOL));
// Drops the last character of the path; the condition for doing so is on an
// omitted line (presumably a trailing '/') -- confirm.
83 d_resourceCacheFileName = d_resourceCacheFileName.substr(0,d_resourceCacheFileName.length()-1);
// NOTE(review): the literal "is not set" has no leading space, so the key name
// and "is" run together in the resulting message.
90 throw BESInternalError( prolog +
"ERROR - "+ BES_CATALOG_ROOT_KEY +
"is not set",__FILE__,__LINE__);
// True when the path does not start with catalog_root (declared on an omitted line).
92 if(d_resourceCacheFileName.find(catalog_root) !=0 ){
// http:// or https:// URL: remember it and assemble the request headers.
97 else if(url.find(HTTPS_PROTOCOL) == 0 || url.find(HTTP_PROTOCOL) == 0){
98 d_remoteResourceUrl = url;
99 BESDEBUG(MODULE, prolog <<
"URL: " << d_remoteResourceUrl << endl);
// NOTE(review): d_request_headers is accessed with '.' here, while the
// three-argument constructor allocates it with new and uses '->'; confirm the
// member's declared type. The assignment of d_uid is not visible either.
103 string client_id_hdr =
"User-Id: " + d_uid;
104 BESDEBUG(MODULE, prolog << client_id_hdr << endl);
105 d_request_headers.push_back(client_id_hdr);
// The Echo-Token header is added only when a token is present.
107 if (!d_echo_token.empty()){
108 string echo_token_hdr =
"Echo-Token: " + d_echo_token;
109 BESDEBUG(MODULE, prolog << echo_token_hdr << endl);
110 d_request_headers.push_back(echo_token_hdr);
// Message for any other scheme; how it is raised is on an omitted line.
116 string err = prolog +
"Unsupported protocol: " + url;
// Constructs a RemoteResource for `url` on behalf of user `uid`, carrying an
// `echo_token` that is sent as an "Echo-Token" request header when non-empty.
// The visible code mirrors the two-argument constructor, except that it stores
// echo_token and allocates d_request_headers on the heap.
// NOTE(review): this listing omits several source lines (guards, closing
// braces, declarations), so the conditions guarding the throws are not shown.
133 RemoteResource::RemoteResource(
const std::string &url,
const std::string &uid,
const std::string &echo_token) {
136 d_initialized =
false;
139 d_echo_token = echo_token;
143 d_resourceCacheFileName.clear();
// Heap-allocated containers: response header lines, request header lines and
// the parsed response header key/value map.
144 d_response_headers =
new vector<string>();
145 d_request_headers =
new vector<string>();
146 d_http_response_headers =
new map<string, string>();
// presumably guarded by an empty-URL check on an omitted line -- confirm
149 throw BESInternalError(prolog +
"Remote resource URL is empty.", __FILE__, __LINE__);
// file:// URL: the cache file name is the URL with the protocol prefix removed.
152 if(url.find(FILE_PROTOCOL) == 0){
153 d_resourceCacheFileName = url.substr(strlen(FILE_PROTOCOL));
// Drops the last character of the path; the condition for doing so is on an
// omitted line (presumably a trailing '/') -- confirm.
156 d_resourceCacheFileName = d_resourceCacheFileName.substr(0,d_resourceCacheFileName.length()-1);
// NOTE(review): the literal "is not set" has no leading space, so the key name
// and "is" run together in the resulting message.
163 throw BESInternalError( prolog +
"ERROR - "+ BES_CATALOG_ROOT_KEY +
"is not set",__FILE__,__LINE__);
// True when the path does not start with catalog_root (declared on an omitted line).
165 if(d_resourceCacheFileName.find(catalog_root) !=0 ){
// http:// or https:// URL: remember it and assemble the request headers.
170 else if(url.find(HTTPS_PROTOCOL) == 0 || url.find(HTTP_PROTOCOL) == 0){
171 d_remoteResourceUrl = url;
172 BESDEBUG(MODULE, prolog <<
"URL: " << d_remoteResourceUrl << endl);
// NOTE(review): the assignment of d_uid is not visible in this listing.
175 string client_id_hdr =
"User-Id: " + d_uid;
176 BESDEBUG(MODULE, prolog << client_id_hdr << endl);
177 d_request_headers->push_back(client_id_hdr);
// The Echo-Token header is added only when a token is present.
179 if (!d_echo_token.empty()){
180 string echo_token_hdr =
"Echo-Token: " + d_echo_token;
181 BESDEBUG(MODULE, prolog << echo_token_hdr << endl);
182 d_request_headers->push_back(echo_token_hdr);
// Message for any other scheme; how it is raised is on an omitted line.
186 string err = prolog +
"Unsupported protocol: " + url;
// Destructor: frees the response-header vector, handles the cache file entry
// when a cache file name is set, cleans up the curl handle and clears the URL.
// NOTE(review): the constructors also allocate d_http_response_headers (and,
// in the three-argument form, d_request_headers) with new; no matching delete
// is visible in this listing -- confirm they are released on an omitted line.
201 RemoteResource::~RemoteResource() {
202 BESDEBUG(MODULE, prolog <<
"BEGIN resourceURL: " << d_remoteResourceUrl << endl);
204 delete d_response_headers;
205 d_response_headers = 0;
206 BESDEBUG(MODULE, prolog <<
"Deleted d_response_headers." << endl);
// Cache handling only when a cache file name is set. The call on `cache` is on
// an omitted line; the debug text below says the file is closed and unlocked.
209 if (!d_resourceCacheFileName.empty()) {
210 HttpCache *cache = HttpCache::get_instance();
213 BESDEBUG(MODULE, prolog <<
"Closed and unlocked " << d_resourceCacheFileName << endl);
214 d_resourceCacheFileName.clear();
// Any guard on d_curl is on an omitted line -- confirm.
220 curl_easy_cleanup(d_curl);
221 BESDEBUG(MODULE, prolog <<
"Called curl_easy_cleanup()." << endl);
225 BESDEBUG(MODULE, prolog <<
"Clearing resourceURL: " << d_remoteResourceUrl << endl);
226 d_remoteResourceUrl.clear();
227 BESDEBUG(MODULE, prolog <<
"END" << endl);
// Returns d_resourceCacheFileName, the name of the cache file for this resource.
// Throws BESInternalError when d_initialized is false, i.e. before the
// resource has been retrieved.
234 std::string RemoteResource::getCacheFileName() {
235 if (!d_initialized) {
236 throw BESInternalError(prolog +
"STATE ERROR: Remote Resource " + d_remoteResourceUrl +
237 " has Not Been Retrieved.", __FILE__, __LINE__);
239 return d_resourceCacheFileName;
// Convenience overload: forwards to retrieveResource(template_key, replace_value).
// replace_value is a default-constructed (empty) string; the declaration of
// template_key is on an omitted line (presumably also an empty string).
249 void RemoteResource::retrieveResource() {
251 string replace_value;
252 retrieveResource(template_key,replace_value);
// Retrieves the remote resource into the HTTP cache and marks this object
// initialized. When template_key is non-empty, occurrences of it in the cached
// content are replaced with replace_value via filter_retrieved_resource().
// The response headers are written to / read from a separate file named by
// hdr_filename.
// NOTE(review): this listing omits many lines (lock acquisition, branch
// conditions, try/catch structure), so the comments below describe only the
// statements that are visible.
266 void RemoteResource::retrieveResource(
const string &template_key,
const string &replace_value) {
267 BESDEBUG(MODULE, prolog <<
"BEGIN resourceURL: " << d_remoteResourceUrl << endl);
// presumably inside an "already initialized" early-exit on an omitted line
271 BESDEBUG(MODULE, prolog <<
"END Already initialized." << endl);
275 HttpCache *cache = HttpCache::get_instance();
// Error text for a missing cache instance; the guarding condition and the
// declaration of oss are on omitted lines.
278 oss << prolog <<
"FAILED to get local cache. ";
279 oss <<
"Unable to proceed with request for " << this->d_remoteResourceUrl;
280 oss <<
" The server MUST have a valid HTTP cache configuration to operate." << endl;
281 BESDEBUG(MODULE, oss.str());
289 BESDEBUG(MODULE, prolog <<
"d_resourceCacheFileName: " << d_resourceCacheFileName << endl);
// Initial guess at the data type from the URL alone.
297 http::get_type_from_url(d_remoteResourceUrl, d_type);
298 BESDEBUG(MODULE, prolog <<
"d_type: " << d_type << endl);
303 prolog <<
"Remote resource is already in cache. cache_file_name: " << d_resourceCacheFileName
// Already-cached path: reload the saved header lines, one per line of the file.
309 std::ifstream hdr_ifs(hdr_filename.c_str());
311 BESDEBUG(MODULE, prolog <<
"Reading response headers from: " << hdr_filename << endl);
312 for (std::string line; std::getline(hdr_ifs, line);) {
313 (*d_response_headers).push_back(line);
314 BESDEBUG(MODULE, prolog <<
"header: " << line << endl);
321 ingest_http_headers_and_type();
322 d_initialized =
true;
// Not-yet-cached path: fetch the resource into the cache file descriptor.
334 writeResourceToFile(d_fd);
// Removes the cache file; presumably part of error cleanup on omitted lines -- confirm.
338 unlink(d_resourceCacheFileName.c_str());
// Optional template substitution on the freshly cached content.
347 if(!template_key.empty()){
348 unsigned int count = filter_retrieved_resource(template_key, replace_value);
349 BESDEBUG(MODULE, prolog <<
"Replaced " << count <<
350 " instance(s) of template(" <<
351 template_key <<
") with " << replace_value <<
" in cached RemoteResource" << endl);
// Persist the response header lines next to the cached content.
360 std::ofstream hdr_out(hdr_filename.c_str());
362 for (
size_t i = 0; i < this->d_response_headers->size(); i++) {
363 hdr_out << (*d_response_headers)[i] << endl;
// Removes both the header file and the cache file; presumably failure cleanup -- confirm.
369 unlink(hdr_filename.c_str());
370 unlink(d_resourceCacheFileName.c_str());
380 BESDEBUG(MODULE, prolog <<
"Converted exclusive cache lock to shared lock." << endl);
386 BESDEBUG(MODULE, prolog <<
"Updated cache info" << endl);
390 BESDEBUG(MODULE, prolog <<
"Updated and purged cache." << endl);
392 BESDEBUG(MODULE, prolog <<
"END" << endl);
393 d_initialized =
true;
398 prolog <<
"Remote resource is in cache. cache_file_name: " << d_resourceCacheFileName
400 d_initialized =
true;
// NOTE(review): the message opens a single quote before the URL but appends
// "\n" without a closing quote.
405 string msg = prolog +
"Failed to acquire cache read lock for remote resource: '";
406 msg += d_remoteResourceUrl +
"\n";
// Debug output from the exception handlers; per the message text they unlock
// the cache and re-throw (the handlers themselves are on omitted lines).
411 BESDEBUG(MODULE, prolog <<
"Caught BESError. type: " << besError.
get_bes_error_type() <<
414 " Will unlock cache and re-throw." << endl);
419 BESDEBUG(MODULE, prolog <<
"Caught unknown exception. Will unlock cache and re-throw." << endl);
// Downloads d_remoteResourceUrl into the open file descriptor `fd`, collecting
// the response headers in d_response_headers, then rewinds `fd` and parses the
// headers via ingest_http_headers_and_type().
// Throws BESError (BES_NOT_FOUND_ERROR) with "Could not seek within the
// response."; the condition on `status` is on an omitted line.
434 void RemoteResource::writeResourceToFile(
int fd) {
436 BESDEBUG(MODULE, prolog <<
"BEGIN" << endl);
// Timing of the transfer; besTimer's declaration and any guard are not visible.
441 besTimer.
start(prolog +
"source url: " + d_remoteResourceUrl);
444 BESDEBUG(MODULE, prolog <<
"Saving resource " << d_remoteResourceUrl <<
" to cache file " << d_resourceCacheFileName << endl);
445 curl::http_get_and_write_resource(d_remoteResourceUrl, fd, d_response_headers);
447 BESDEBUG(MODULE, prolog <<
"Resource " << d_remoteResourceUrl <<
" saved to cache file " << d_resourceCacheFileName << endl);
// Rewind to the start of the file.
453 int status = lseek(fd, 0, SEEK_SET);
// NOTE(review): unlike other messages in this file, this one has no prolog prefix.
455 throw BESError(
"Could not seek within the response.", BES_NOT_FOUND_ERROR, __FILE__, __LINE__);
456 BESDEBUG(MODULE, prolog <<
"Reset file descriptor." << endl);
459 ingest_http_headers_and_type();
464 BESDEBUG(MODULE, prolog <<
"END" << endl);
// Parses the raw header lines in d_response_headers into the
// d_http_response_headers map (split on ": "), then works out the data type,
// consulting in order: the content-disposition header, the content-type
// header, and finally the resource URL.
// NOTE(review): the derivation of `key`, the declaration of `type`, the final
// assignment to d_type and several branch conditions are on omitted lines.
470 void RemoteResource::ingest_http_headers_and_type() {
471 BESDEBUG(MODULE, prolog <<
"BEGIN" << endl);
473 const string colon_space =
": ";
474 for (
size_t i = 0; i < this->d_response_headers->size(); i++) {
475 string header = (*d_response_headers)[i];
476 BESDEBUG(MODULE, prolog <<
"Processing header " << header << endl);
477 size_t colon_index = header.find(colon_space);
// Lines without the ": " delimiter are reported; per the message they are skipped.
478 if(colon_index == string::npos){
479 BESDEBUG(MODULE, prolog <<
"Unable to locate the colon space \": \" delimiter in the header " <<
480 "string: '" << header <<
"' SKIPPING!" << endl);
// Everything after the first ": " is the value.
484 string value = header.substr(colon_index + colon_space.length());
485 BESDEBUG(MODULE, prolog <<
"key: " << key <<
" value: " << value << endl);
486 (*d_http_response_headers)[key] = value;
489 std::map<string, string>::iterator it;
// First choice: derive the type from the content-disposition header.
495 string content_disp_hdr;
496 content_disp_hdr = get_http_response_header(
"content-disposition");
497 if (!content_disp_hdr.empty()) {
501 BESDEBUG(MODULE,prolog <<
"Evaluated content-disposition '" << content_disp_hdr <<
"' matched type: \"" << type <<
"\"" << endl);
// Second choice: the content-type header, only if no type was found yet.
508 string content_type = get_http_response_header(
"content-type");
509 if (type.empty() && !content_type.empty()) {
510 http::get_type_from_content_type(content_type, type);
511 BESDEBUG(MODULE,prolog <<
"Evaluated content-type '" << content_type <<
"' matched type \"" << type <<
"\"" << endl);
// Last choice: infer the type from the URL (guard is on an omitted line).
517 http::get_type_from_url(d_remoteResourceUrl, type);
518 BESDEBUG(MODULE, prolog <<
"Evaluated url '" << d_remoteResourceUrl <<
"' matched type: \"" << type <<
"\"" << endl);
// Nothing matched: per the message the type is set to 'unknown'.
523 string err = prolog +
"Unable to determine the type of data"
524 +
" returned from '" + d_remoteResourceUrl +
"' Setting type to 'unknown'";
525 BESDEBUG(MODULE, err << endl);
530 BESDEBUG(MODULE, prolog <<
"END (dataset type: " << d_type <<
")" << endl);
// Looks up `header_name` in the d_http_response_headers map. Callers in this
// file pass lowercase names ("content-disposition", "content-type") and treat
// an empty result as "header absent".
// NOTE(review): the return type, the lookup that assigns `it`, and the return
// statements are on omitted lines. `header_name` is taken by value.
539 RemoteResource::get_http_response_header(
const std::string header_name) {
541 std::map<string, string>::iterator it;
543 if (it != d_http_response_headers->end())
// Replaces occurrences of `template_str` with `update_str` in the cached
// resource file (d_resourceCacheFileName) and returns replace_count.
// The whole file is read into memory, edited as a string, and written back to
// the same path.
// NOTE(review): the throw statements after the "Could not open" messages, the
// declaration of startIndex and the increment of replace_count are on omitted
// lines.
559 unsigned int RemoteResource::filter_retrieved_resource(
const std::string &template_str,
const std::string &update_str){
560 unsigned int replace_count = 0;
// Read the cached file completely into resource_content.
564 std::ifstream cr_istrm(d_resourceCacheFileName);
565 if (!cr_istrm.is_open()) {
566 string msg =
"Could not open '" + d_resourceCacheFileName +
"' to read cached response.";
567 BESDEBUG(MODULE, prolog << msg << endl);
570 std::stringstream buffer;
571 buffer << cr_istrm.rdbuf();
572 string resource_content(buffer.str());
// NOTE(review): find() restarts from the beginning on every pass. As shown,
// the loop would not terminate if update_str contains template_str, or if
// template_str is empty (find() then always returns 0) -- unless the omitted
// lines guard against this. The result is compared with -1 rather than
// std::string::npos.
577 while ((startIndex = resource_content.find(template_str)) != -1) {
578 resource_content.erase(startIndex, template_str.length());
579 resource_content.insert(startIndex, update_str);
// Overwrite the cache file with the modified content.
585 std::ofstream cr_ostrm(d_resourceCacheFileName);
586 if (!cr_ostrm.is_open()) {
587 string msg =
"Could not open '" + d_resourceCacheFileName +
"' to write modified cached response.";
588 BESDEBUG(MODULE, prolog << msg << endl);
591 cr_ostrm << resource_content;
593 return replace_count;
// Returns the cached response body as a string by reading the cache file named
// by getCacheFileName().
// NOTE(review): the state check that precedes the first message, the
// declarations of `msg`, the return statement and the throws are on omitted
// lines.
599 std::string RemoteResource::get_response_as_string() {
// Error text for calling this before the resource has been retrieved.
603 msg <<
"ERROR. Internal state error. " << __PRETTY_FUNCTION__ <<
" was called prior to retrieving resource.";
604 BESDEBUG(MODULE, prolog << msg.str() << endl);
607 string cache_file = getCacheFileName();
// The open mode is spelled std::ofstream::in on an input stream.
610 std::ifstream file_istream(cache_file, std::ofstream::in);
// Success path: read the whole file into a stringstream.
613 if(file_istream.is_open()){
615 BESDEBUG(MODULE, prolog <<
"Using cached file: " << cache_file << endl);
616 std::stringstream buffer;
617 buffer << file_istream.rdbuf();
// Failure path: report that the cache file could not be opened.
622 msg <<
"ERROR. Failed to open cache file " << cache_file <<
" for reading.";
623 BESDEBUG(MODULE, prolog << msg.str() << endl);
637 string response = get_response_as_string();
639 d.Parse(response.c_str());
// Returns the d_response_headers pointer (raw response header lines).
// Throws BESInternalError with a "Has Not Been Retrieved" state error; the
// guarding condition is on an omitted line (presumably !d_initialized).
646 vector<string> *RemoteResource::getResponseHeaders() {
648 throw BESInternalError(prolog +
"STATE ERROR: Remote Resource Has Not Been Retrieved.",__FILE__,__LINE__);
650 return d_response_headers;
// Determines the data type of the response from the header lines in
// `resp_hdrs`, consulting in order: content-disposition, content-type, and
// finally the resource URL; per the last message the type becomes 'unknown'
// when nothing matches.
// NOTE(review): this follows the same decision order as
// ingest_http_headers_and_type() but calls the HttpUtils::Get_type_from_*
// helpers instead of http::get_type_from_*. The declarations of `type`, `disp`
// and `ctype` and the final assignment are on omitted lines.
655 void RemoteResource::setType(
const vector<string> *resp_hdrs) {
657 BESDEBUG(MODULE, prolog <<
"BEGIN" << endl);
667 vector<string>::const_iterator i = resp_hdrs->begin();
668 vector<string>::const_iterator e = resp_hdrs->end();
669 for (; i != e; i++) {
670 string hdr_line = (*i);
672 BESDEBUG(MODULE, prolog <<
"Evaluating header: " << hdr_line << endl);
// Split "name: value" on the first ": ".
// NOTE(review): the find() result is stored in an int and used without an
// npos check. For a line with no ": ", index is typically -1, so hdr_name
// becomes the whole line and hdr_value starts at offset 1; substr(1) throws
// std::out_of_range for an empty line.
676 string colon_space =
": ";
677 int index = hdr_line.find(colon_space);
678 string hdr_name = hdr_line.substr(0, index);
679 string hdr_value = hdr_line.substr(index + colon_space.length());
681 BESDEBUG(MODULE, prolog <<
"hdr_name: '" << hdr_name <<
"' hdr_value: '" << hdr_value <<
"' " << endl);
// Substring match on the header name (not an exact comparison).
683 if (hdr_name.find(
"content-disposition") != string::npos) {
685 BESDEBUG(MODULE, prolog <<
"Located content-disposition header." << endl);
688 if (hdr_name.find(
"content-type") != string::npos) {
689 BESDEBUG(MODULE, prolog <<
"Located content-type header." << endl);
// First choice: type from the content-disposition value.
698 HttpUtils::Get_type_from_disposition(disp, type);
699 BESDEBUG(MODULE,prolog <<
"Evaluated content-disposition '" << disp <<
"' matched type: \"" << type <<
"\"" << endl);
// Second choice: the content-type value, only if no type was found yet.
706 if (type.empty() && !ctype.empty()) {
707 HttpUtils::Get_type_from_content_type(ctype, type);
708 BESDEBUG(MODULE,prolog <<
"Evaluated content-type '" << ctype <<
"' matched type \"" << type <<
"\"" << endl);
// Last choice: infer the type from the URL (guard is on an omitted line).
714 HttpUtils::Get_type_from_url(d_remoteResourceUrl, type);
715 BESDEBUG(MODULE,prolog <<
"Evaluated url '" << d_remoteResourceUrl <<
"' matched type: \"" << type <<
"\"" << endl);
720 string err = prolog +
"Unable to determine the type of data"
721 +
" returned from '" + d_remoteResourceUrl +
"' Setting type to 'unknown'";
722 BESDEBUG(MODULE, err << endl);
static bool IsSet(const std::string &flagName)
See if the debug context flagName is set to true.
Abstract exception class for the BES with basic string message.
virtual int get_bes_error_type()
Return the return code for this error class.
virtual int get_line()
get the line number where the exception was thrown
virtual std::string get_file()
get the file name where the exception was thrown
virtual std::string get_message()
get the error message for this exception
virtual void unlock_cache()
virtual void unlock_and_close(const std::string &target)
virtual unsigned long long update_cache_info(const std::string &target)
Update the cache info file to include 'target'.
virtual bool create_and_lock(const std::string &target, int &fd)
Create a file in the cache and lock it for write access.
virtual void exclusive_to_shared_lock(int fd)
Transfer from an exclusive lock to a shared lock.
virtual bool get_read_lock(const std::string &target, int &fd)
Get a read-only lock on the file if it exists.
virtual bool cache_too_big(unsigned long long current_size) const
Look at the cache size and see if it is too big.
virtual void update_and_purge(const std::string &new_file)
Purge files from the cache.
Exception thrown if an internal error is encountered.
virtual bool start(std::string name)
static bool endsWith(std::string const &fullString, std::string const &ending)
static std::string lowercase(const std::string &s)
static std::string pathConcat(const std::string &firstPart, const std::string &secondPart, char separator='/')
Concatenate path fragments making sure that they are separated by a single '/' character.
void get_value(const std::string &s, std::string &val, bool &found)
Retrieve the value of a given key, if set.
static TheBESKeys * TheKeys()
A cache for content accessed via HTTP.
virtual std::string get_cache_file_name(const std::string &uid, const std::string &src, bool mangle=true)
GenericDocument< UTF8<> > Document
GenericDocument with UTF8 encoding.
utility class for the HTTP catalog module
void get_type_from_disposition(const string &disp, string &type)