diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..89e08fb --- /dev/null +++ b/LICENSE @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. 
+ + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. 
The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. 
(Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. 
You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. 
+ +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. 
If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. 
diff --git a/README.md b/README.md index bacbb20..b5a8b0e 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,7 @@ -# oohay - -Fork of wiby \ No newline at end of file +These are the source files for the Wiby search engine. + +Video demo: https://youtu.be/nCfWJqNBqHo + +Refer to the installation guide located in /html/about/guide.html + +You can also access it at http://wiby.me/about/guide.html diff --git a/c/abandoned.txt b/c/abandoned.txt new file mode 100755 index 0000000..e69de29 diff --git a/c/checkrobots.h b/c/checkrobots.h new file mode 100755 index 0000000..09abc13 --- /dev/null +++ b/c/checkrobots.h @@ -0,0 +1,257 @@ +#include +#include +#include +//#include //RHEL/Rocky +//#include //RHEL/Rocky +//#include //ubuntu 20/22 +//#include //ubuntu 20/22 + +//gcc checkrobots.c -o checkrobots -lcurl + +#define rwindow_len 100 +FILE *robotsfile; +char *robotsfilestr,robotsurl[1011],rwindow[rwindow_len]; +//char rURLpath[] = "/dumpop/"; + +size_t write_data_checkrobots(void *ptr, size_t size, size_t nmemb, FILE *stream) { + size_t written = fwrite(ptr, size, nmemb, stream); + return written; +} +int locateInRWindow(char *window, char *birdLower, char *birdUpper, int length); + +//int main(int argc, char **argv) +int checkrobots(char *rURLprefix, char *rDomain, char *rURLpath) +{ + if(rURLprefix[0]==0 || rDomain[0]==0 || rURLpath[0]==0) + return 1; + if(strlen(rDomain)>253) + return 0; + if(strlen(rURLpath)>500) + return 0; + + memset(rwindow,'?',rwindow_len); +// rwindow[rwindow_len]=0; + + curl_global_init(CURL_GLOBAL_DEFAULT); + CURL *curl; + FILE *fp; + CURLcode res; + curl = curl_easy_init(); + memset(robotsurl,0,1011); + strcpy(robotsurl,rURLprefix); + strcat(robotsurl,rDomain); + strcat(robotsurl,"/robots.txt"); + char outfilename[300]; + memset(outfilename,0,300); + strcpy(outfilename,"robots/"); + strcat(outfilename,rDomain); + strcat(outfilename,".txt"); + long fsize=0,response_code_checkrobots=0; + char *finalURL_checkrobots = NULL; + int 
foundfile=0,alloced=0; + char rb,rwb; + printf("\nChecking robots.txt: "); + + //open robots.txt file and load into memory, or download it if it doesn't exist + if(robotsfile = fopen(outfilename, "rb")){ + fseek(robotsfile, 0, SEEK_END); + fsize = ftell(robotsfile); + fseek(robotsfile, 0, SEEK_SET); /* same as rewind(f); */ + + robotsfilestr = malloc(fsize + 1); + alloced=1; + if(fread(robotsfilestr, 1, fsize, robotsfile)){} + fclose(robotsfile); + + robotsfilestr[fsize] = 0; + //printf("%ld",fsize); + + foundfile=1; + }else if (curl) { + printf("Downloading... "); + if(fp = fopen(outfilename,"wb")){ + //set curl options + curl_easy_setopt(curl, CURLOPT_URL, robotsurl);// set URL to get here + curl_easy_setopt(curl, CURLOPT_USERAGENT, "Mozilla/5.0 (compatible; WebCrawler; SearchEngine)"); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_data_checkrobots);// send all data to this function // + curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp);// write the page body to this file handle + curl_easy_setopt(curl,CURLOPT_FOLLOWLOCATION,1L);//allow redirects + curl_easy_setopt(curl, CURLOPT_TIMEOUT, 30L); + curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 15L); + curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 5L);//max num of redirects + curl_easy_setopt(curl, CURLOPT_MAXFILESIZE, 1000000L);//don't download if over 1MB + curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0L);//0 or 1 to verify ssl + res = curl_easy_perform(curl);// get it! 
+ curl_easy_getinfo(curl, CURLINFO_EFFECTIVE_URL, &finalURL_checkrobots); + curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code_checkrobots); + //curl_easy_cleanup(curl);// always cleanup (done further down) + fclose(fp); + if(response_code_checkrobots!=200){ + fp = fopen(outfilename,"wb"); + fclose(fp); + } + }else{ + curl_easy_cleanup(curl); + curl_global_cleanup(); + printf("\nFailed to create file: %s - proceeding anyway.",outfilename); + return 1; + } + } + if(response_code_checkrobots==200 && foundfile==0){ + robotsfile = fopen(outfilename, "rb"); + fseek(robotsfile, 0, SEEK_END); + fsize = ftell(robotsfile); + fseek(robotsfile, 0, SEEK_SET); // same as rewind(f); + + robotsfilestr = malloc(fsize + 1); + alloced=1; + if(fread(robotsfilestr, 1, fsize, robotsfile)){} + fclose(robotsfile); + + robotsfilestr[fsize] = 0; + //printf("%ld",fsize); + } + //parse the robots.txt file + if(response_code_checkrobots==200 || foundfile==1 && fsize > 11){ + int foundUserAgent=0,foundDisallow=0,foundAllow=0,comment=0,match=0; + int k=0,lenurlpath=strlen(rURLpath),rwupdated=0,result=1; + for(int i=0;i=lenurlpath) + match=0; + k++; + } + if((i==fsize-1 && match==1) || ((rwb==10 || rwb==13) && match==1)){ + result=0; + foundDisallow=0; + } + if(match==0) + foundDisallow=k=0; + } + //check if path is allowed in url + if(rwupdated==1 && foundAllow==1){ + if(rwb!=10 && rwb!=13){ + //get path + if(k=lenurlpath) + match=0; + k++; + } + if((i==fsize-1 && match==1) || ((rwb==10 || rwb==13) && match==1)){ + printf("Permitted."); + curl_easy_cleanup(curl); + curl_global_cleanup(); + if(alloced==1) + free(robotsfilestr); + return 1; + } + if(match==0) + foundAllow=k=0; + } + + if(foundUserAgent==1 && rwupdated && locateInRWindow(rwindow,"disallow:","DISALLOW:",9)==1){ + foundDisallow=1; + foundAllow=0; + k=0; + //printf("\nfound disallow"); + } + if(foundUserAgent==1 && rwupdated && locateInRWindow(rwindow,"\nallow:","\nALLOW:",7)==1){ + foundDisallow=0; + foundAllow=1; + k=0; 
+ //printf("\nfound allow"); + } + } + rwupdated=0; + } + + if(result==0){ + printf("Denied."); + curl_easy_cleanup(curl); + curl_global_cleanup(); + if(alloced==1) + free(robotsfilestr); + return 0; + }else{ + printf("Permitted."); + curl_easy_cleanup(curl); + curl_global_cleanup(); + if(alloced==1) + free(robotsfilestr); + return 1; + } + } + printf("Permitted."); + curl_easy_cleanup(curl); + if(alloced==1) + free(robotsfilestr); + return 1; +} + + +int locateInRWindow(char *window, char *birdLower, char *birdUpper, int length) +{ + int start = rwindow_len-length; + for(int i=0;i +#include +#include +#include +//#include //RHEL/Rocky +//#include //RHEL/Rocky +#include //ubuntu 20/22 +#include //ubuntu 20/22 +#include "htmlparse.h" +#include "urlparse.h" +#include "checkrobots.h" +#include + +#define url_fromlist_arraylen 102400 +#define url_insert_arraylen 1024000 + +char /**title, *keywords, *description, *page,*/ *windexinsert, *windexupdate, *windexRandUpdate, *titlecheckinsert, /**shardinsert,*/ correctedURL[1001], urlPath_finalURL[1001], folderPath_finalURL[1001], urlPrefix_finalURL[1001], urlNPNP_finalURL[1001], strDepth[101], url_fromlist[url_fromlist_arraylen], url_insert[url_insert_arraylen], previousfail[5][1001]; + +FILE *shardfile; +char *shardfilestr; + +void finish_with_error(MYSQL *con) +{ + fprintf(stderr, "%s\n", mysql_error(con)); + mysql_close(con); + exit(1); +} +int isnum(char *source){ + int sourcelength = strlen(source); + for(int i=0;i < sourcelength; i++){ + if(source[i] < 48 || source[i] > 57){ + return 0; + } + } + return 1; +} +size_t write_data(void *ptr, size_t size, size_t nmemb, FILE *stream) { + size_t written = fwrite(ptr, size, nmemb, stream); + return written; +} + +int main(int argc, char **argv) +{ + int id_assigned=0,sleeptime=1; + if(argc >= 2 && isnum(argv[1])==1){ + if(argv[1][0] == 48){ + printf("When assigning ID's, you must start at 1. 
Cannot set an id of 0.\n"); + exit(0); + } + id_assigned=1; + } + if((argc >= 2 && isnum(argv[1])==0) || (argc >= 3 && isnum(argv[2])==0) || argc > 3){ + printf("\nWiby Web Crawler\n\nUsage: cr Crawler_ID Sleep_Time(s)\n\nThe indexqueue may have each page assigned a crawler ID. The ID is assigned when you specify to the Refresh Scheduler the total number of crawlers you are running, and when you update the variable '$num_crawlers' from inside of review.php and graveyard.php (line 73) to the number of crawlers you are using. The scheduler will assign pages in round-robin order a crawler ID within the range of that total.\n\nExample: If you want two crawlers running, then you should specify the first with an ID of 1, and the second with and ID of 2. Run them in separate folders, and provide a symlink to the 'robots' folder and 'shards' file in each. Each crawler will crawl pages in the indexqueue with its corresponding ID.\n\nYou can also not assign an ID, and in that case the crawler will ignore the ID assignments. So if you have only one crawler running, assigning an ID is optional unless you need to change the sleep time (then just use an ID of 1). Don't run multiple crawlers without assigning ID's.\n\nSpecify the total number of shard tables you wish to use in the 'shards' file. The crawler will round-robin insert/update rows in these tables (ws0 to wsX) along with the main 'windex' table. The default is 4.\n\nThe Sleep_Time is 1 second by default but can be set to 0 or higher, and is used when crawling hyperlinks is specified. It inserts a delay between each link that it crawls. 
This delay is not used between individual pages that were submitted by people.\n\n"); + exit(0); + } + if(argc >= 3){ + sleeptime = atoi(argv[2]); + } + + long int previousID[10] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + int sanity = 1; + + for(int i=0;i<5;i++){ + previousfail[i][0]=0; + } + + //check if there are shards to include + int nShards=0,fsize=0,shardnum=0; + char numshards[20], shardnumstr[20]; + memset(numshards,0,20); + memset(shardnumstr,0,20); + sprintf(shardnumstr,"0"); + if(shardfile = fopen("shards", "r")){ + fseek(shardfile, 0, SEEK_END); + fsize = ftell(shardfile); + fseek(shardfile, 0, SEEK_SET); + if(fsize > 0 && fsize < 11){ + shardfilestr = malloc(fsize + 1); + if(fread(shardfilestr, 1, fsize, shardfile)){} + shardfilestr[fsize] = 0; + for(int i=0;i10 || fsize<1){ + printf("\nTotal number of shards is not specified or too large.\n"); + exit(0); + } + fclose(shardfile); + }else{ + printf("\nWarning: 'shards' file is missing. Create the file and indicate the number of available shards you are using or set it to 0 if you aren't.\n\n"); + } + if(nShards > 0){ + srand(time(NULL)); + shardnum = (rand() % nShards); + memset(shardnumstr,0,20); + sprintf(shardnumstr,"%d",shardnum); + } + + while(1) + { + //printf("MySQL client version: %s\n", mysql_get_client_info()); + int alreadydone = 0, permitted=1; + //allocates or initialises a MYSQL object + + MYSQL *con = mysql_init(NULL); + + if (con == NULL) + { + finish_with_error(con); + } + + //establish a connection to the database. We provide connection handler, host name, user name and password parameters to the function. 
The other four parameters are the database name, port number, unix socket and finally the client flag + if (mysql_real_connect(con, "localhost", "crawler", "seekout", "wiby", 0, NULL, 0) == NULL) + { + finish_with_error(con); + } + + if (mysql_query(con, "SET CHARSET utf8;")) + { + finish_with_error(con); + } + + if(id_assigned == 0){ + if (mysql_query(con, "SELECT id, url, worksafe, approver, surprise, updatable, task, crawl_tree, crawl_family, crawl_depth, crawl_pages, crawl_type, crawl_repeat, force_rules FROM indexqueue limit 1;")) + { + finish_with_error(con); + } + }else{ + char indexqueuequery[2001]; + memset(indexqueuequery,0,2001); + strcpy(indexqueuequery,"SELECT id, url, worksafe, approver, surprise, updatable, task, crawl_tree, crawl_family, crawl_depth, crawl_pages, crawl_type, crawl_repeat, force_rules FROM indexqueue WHERE crawler_id = '"); + strcat(indexqueuequery,argv[1]); + strcat(indexqueuequery,"' LIMIT 1;"); + if (mysql_query(con, indexqueuequery)) + { + finish_with_error(con); + } + } + + //We get the result set using the mysql_store_result() function. 
The MYSQL_RES is a structure for holding a result set + MYSQL_RES *result = mysql_store_result(con); + + if(result == NULL) + { + finish_with_error(con); + } + + MYSQL_ROW row = mysql_fetch_row(result); + + int empty=0; + if(row == NULL){ + //printf("\nQueue is empty\n"); + empty=1; + }else{ + //convert shardnum to string + if(nShards > 0){ + sprintf(shardnumstr,"%d",shardnum); + //itoa(shardnum,shardnumstr,10); + } + if(id_assigned == 0){ + printf("-----------------------------------------------------------------------------------\nFetching:"); + }else{ + printf("-----------------------------------------------------------------------------------\ncr%s Fetching:",argv[1]); + } + //grab the first entry (fifo) + /*for(int i=0; i 4){ + if(url[4]==':' && (url[3]=='p' || url[3]=='P')) + http = 7; + } + if(urlsize > 5){ + if(url[5]==':' && (url[4]=='s' || url[4]=='S')) + https = 8; + } + if(urlsize > 11){ + if((url[7]=='w' || url[7]=='W') && (url[8]=='w' || url[8]=='W') && ((url[9]=='w' || url[9]=='W') || url[9]=='1' || url[9]=='2' || url[9]=='3') && url[10]=='.' ){ + httpwww = 11; + http = https = 0; + } + if(url[7]=='/' && (url[8]=='w' || url[8]=='W') && (url[9]=='w' || url[9]=='W') && ((url[9]=='w' || url[9]=='W') || url[9]=='1' || url[9]=='2' || url[9]=='3') && url[11]=='.' 
){ + httpswww = 12; + http = https = 0; + } + } + + //set the prefix + + if(http > 0) strcat(prefix,"://"); + else if(https > 0) strcat(prefix,"s://"); + else if(httpwww > 0) strcat(prefix,"://www."); + else if(httpswww > 0) strcat(prefix,"s://www."); + + int prefixsize = httpswww+httpwww+https+http; + char urlnoprefix[urlsize-prefixsize+1]; + char urlnopathnoprefix[urlsize-prefixsize+1]; + memset(urlnoprefix,0,urlsize-prefixsize+1); + memset(urlnopathnoprefix,0,urlsize-prefixsize+1); + int urlcount=0,urlnoprefixcount=0,urlnopathnoprefix_done=0; + + //store the url without prefix to urlnoprefix + while(urlcount < urlsize+1) + { + if(urlcount>prefixsize-1) + { + urlnoprefix[urlnoprefixcount]=url[urlcount]; + //get urlnopath + if(url[urlcount] != '/' && urlnopathnoprefix_done==0){ + urlnopathnoprefix[urlnoprefixcount]=url[urlcount]; + }else{ + urlnopathnoprefix_done=1; + } + urlnoprefixcount++; + } + urlcount++; + } + + //check for '/' at end of url. it may be already indexed without that so we need to account for it. + //int urlnoprefixlength = strlen(urlnoprefix); + int slashfound = 0; + char urlnoprefixnoslash[urlnoprefixcount]; + memset(urlnoprefixnoslash,0,urlnoprefixcount); + if(urlnoprefix[urlnoprefixcount-1] == '/') + { + strncpy(urlnoprefixnoslash,urlnoprefix,urlnoprefixcount-1); + slashfound = 1; + } + //printf("\nurlnoprefix: %s\n",urlnoprefix); + + printf("Checking if page already exists in index... 
"); + int idexistsalready = 0, checkurlsize = urlnoprefixcount*24+1000; + char *idexistsvalue; + char checkurl[checkurlsize]; + memset(checkurl,0,checkurlsize); + if(task == 0 || task[0] == '2'){//index request did not come from refresh scheduler, or is an autocrawl url + //strcpy(checkurl,"SELECT id,updatable,title,enable,fault,url FROM windex WHERE url = 'http://"); //replace this with a simple check for url_noprefix column match + strcpy(checkurl,"SELECT id,updatable,title,enable,fault,url,url_noprefix,shard FROM windex WHERE url_noprefix = '"); + if(slashfound==0) + { + strcat(checkurl,urlnoprefix); + strcat(checkurl,"' OR url_noprefix = '"); + strcat(checkurl,urlnoprefix);strcat(checkurl,"/"); + strcat(checkurl,"' OR url_noprefix = '"); + strcat(checkurl,urlnoprefix);strcat(checkurl,"/index.html"); + strcat(checkurl,"' OR url_noprefix = '/index.htm"); + strcat(checkurl,"';"); + } + else + { + strcat(checkurl,urlnoprefix); + strcat(checkurl,"' OR url_noprefix = '"); + strcat(checkurl,urlnoprefixnoslash); + strcat(checkurl,"' OR url_noprefix = '"); + strcat(checkurl,urlnoprefix);strcat(checkurl,"index.html"); + strcat(checkurl,"' OR url_noprefix = '"); + strcat(checkurl,urlnoprefix);strcat(checkurl,"index.htm"); + strcat(checkurl,"';"); + } + }else{ + strcpy(checkurl,"SELECT id,updatable,title,enable,fault,url,url_noprefix,shard FROM windex WHERE url = '"); + strcat(checkurl,url); + strcat(checkurl,"';"); + } + + if (mysql_query(con, checkurl)) + { + finish_with_error(con); + } + + //We get the result set using the mysql_store_result() function. 
The MYSQL_RES is a structure for holding a result set + MYSQL_RES *resulturlcheck = mysql_store_result(con); + + if(resulturlcheck == NULL) + { + finish_with_error(con); + } + + //grab the first entry (fifo) + printf("Found ID "); + row = mysql_fetch_row(resulturlcheck); + char updatedefault[] = "1"; + char *updatableOldDBval = updatedefault; + char *enableOldDBval = updatedefault; + char *dbtitle; + char *fault; + char *dburl; + char *dburl_noprefix; + char *shard; + + //Catalog the previous crawl attempts (to see if they are all for the same page - which would be a bad sign) + previousID[9] = previousID[8]; + previousID[8] = previousID[7]; + previousID[7] = previousID[6]; + previousID[6] = previousID[5]; + previousID[5] = previousID[4]; + previousID[4] = previousID[3]; + previousID[3] = previousID[2]; + previousID[2] = previousID[1]; + previousID[1] = previousID[0]; + + if(row == NULL) + { + printf("null"); + previousID[0] = -1; + }else { + printf("%s",row[0]); + idexistsalready = 1; + idexistsvalue = row[0]; + previousID[0] = atoi(row[0]); + updatableOldDBval = row[1]; + dbtitle = row[2]; + enableOldDBval = row[3]; + fault = row[4]; + dburl=row[5]; + dburl_noprefix=row[6]; + shard=row[7]; + if(task != 0 && task[0]=='2') + alreadydone=1; + } + + //Log duplicate rows (they shouldn't exist) + int num_rows = mysql_num_rows(resulturlcheck); + if(num_rows > 1){ + FILE *duplicates = fopen("duplicates.txt", "a"); + fputs (dburl,duplicates); + fputs ("\r\n",duplicates); + fclose(duplicates); + } + + //check robots.txt file for this domain + urlparse(url); + //if(task != 0 && task[0]=='2'){ //enable this statement if you only want to check robots.txt when crawling through hyperlinks, but not on human submissions + permitted = checkrobots(prefix,rootdomain,urlPath); //comment this line out if you want to completely disable checking robots.txt + //} + + //Does this crawl attempt, along with the last 9 have the same ID? 
There is possibly a duplicate db entry, or some other problem. + if(previousID[0] != -1 && alreadydone==0){ + if(previousID[0] == previousID[9] && previousID[0] == previousID[8] && previousID[0] == previousID[7] && previousID[0] == previousID[6] && previousID[0] == previousID[5] && previousID[0] == previousID[4] && previousID[0] == previousID[3] && previousID[0] == previousID[2] && previousID[0] == previousID[1]){ + sanity = 0; + printf("\nWARNING: Last 10 crawl attempts are all for the same page. Will not continue crawling in this situation. Is the same page being submitted over and over? Also, duplicate table entries of the same URL in windex can cause this behavior. Check the database, and duplicates.txt\n\n"); + exit(0); + }else{ + sanity = 1; + } + + }else{ + sanity = 1; + } + + int failedcrawl=0; + if(task != 0 && task[0]=='2' && alreadydone==0 && permitted==1){ + //see if url failed to crawl last time (when link crawling) + //as it might come up multiple times during crawl of website, should avoid recrawling it + //will also check the database if this check passes + for(int i=0;i<5;i++){ + if(strcasecmp(previousfail[i], urlnoprefix)==0){ + sanity=0; + failedcrawl=1; + break; + } + } + if(sanity==1 && sleeptime > 0) + sleep(sleeptime);//do link crawling slowly, 1 second is default unless specified + } + + //if crawling through hyperlinks, doublecheck that this hyperlink hasn't been crawled recently, even if it was redirected elsewhere or failed + int alreadylogged = 0; + if(failedcrawl==0 && task !=0 && task[0]=='2' && alreadydone == 0){ + if (mysql_query(con, "use wibytemp")) + { + finish_with_error(con); + } + memset(checkurl,0,checkurlsize); + strcpy(checkurl,"SELECT id FROM crawled WHERE url_noprefix = '"); + if(slashfound==0) + { + strcat(checkurl,urlnoprefix); + strcat(checkurl,"' AND time > NOW() - INTERVAL 12 HOUR OR url_noprefix = '"); + strcat(checkurl,urlnoprefix);strcat(checkurl,"/"); + strcat(checkurl,"' AND time > NOW() - INTERVAL 12 HOUR OR 
url_noprefix = '"); + strcat(checkurl,urlnoprefix);strcat(checkurl,"/index.html"); + strcat(checkurl,"' AND time > NOW() - INTERVAL 12 HOUR OR url_noprefix = '/index.htm"); + strcat(checkurl,"' AND time > NOW() - INTERVAL 12 HOUR"); + }else{ + strcat(checkurl,urlnoprefix); + strcat(checkurl,"' AND time > NOW() - INTERVAL 12 HOUR OR url_noprefix = '"); + strcat(checkurl,urlnoprefixnoslash); + strcat(checkurl,"' AND time > NOW() - INTERVAL 12 HOUR OR url_noprefix = '"); + strcat(checkurl,urlnoprefix);strcat(checkurl,"index.html"); + strcat(checkurl,"' AND time > NOW() - INTERVAL 12 HOUR OR url_noprefix = '"); + strcat(checkurl,urlnoprefix);strcat(checkurl,"index.htm"); + strcat(checkurl,"' AND time > NOW() - INTERVAL 12 HOUR"); + } + //query db + if (mysql_query(con, checkurl)) + { + finish_with_error(con); + } + MYSQL_RES *resultcrawledurlcheck = mysql_store_result(con); + if(resultcrawledurlcheck == NULL) + { + finish_with_error(con); + } + //grab the first entry (fifo) + MYSQL_ROW rowCrawledURLCheck = mysql_fetch_row(resultcrawledurlcheck); + if(rowCrawledURLCheck != NULL) + { + sanity=0; + alreadylogged = 1; + printf("\nThis hyperlink was crawled recently. 
It cannot be crawled again for at least 12 hours."); + } + mysql_free_result(resultcrawledurlcheck); + if (mysql_query(con, "use wiby")) + { + finish_with_error(con); + } + } + + //printf("\n\n%ld, %ld, %ld, %ld, %ld\n",previousID[0],previousID[1],previousID[2],previousID[3],previousID[4]); + + //see if the server will accept http only connections on older browsers, change url to HTTP only: + char urlHTTP[strlen(url)+100]; + memset(urlHTTP,0,strlen(url)+100); + strcpy(urlHTTP,"http"); + if(http > 0 || https > 0){ + strcat(urlHTTP,"://"); + }else if(httpwww > 0 || httpswww > 0){ + strcat(urlHTTP,"://www."); + } + strcat(urlHTTP,urlnoprefix); + + if(updatableOldDBval[0] != '0' && enableOldDBval[0] != '0' && sanity == 1 && alreadydone==0 && permitted==1) + { + printf("\nAttempt HTTP connection: %s",urlHTTP); + printf("\nDownloading page... "); + //===============do the curl (download the webpage)===================== + curl_global_init(CURL_GLOBAL_DEFAULT); + CURL *curl; + FILE *fp; + CURLcode res; + char outfilename[FILENAME_MAX] = "page.out"; + curl = curl_easy_init(); + long size=0; + char *finalURL = NULL; + long response_code; + int finalURLsize=0, skipurl=0; + if (curl) { + fp = fopen(outfilename,"wb"); + //Get file size + //fseek(fp, 0L, SEEK_END); + //size = ftell(fp); + //set curl options + curl_easy_setopt(curl, CURLOPT_URL, urlHTTP);// set URL to get here + curl_easy_setopt(curl, CURLOPT_USERAGENT, "Mozilla/4.0 (compatible; WebCrawler; SearchEngine)"); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, write_data);// send all data to this function // + curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp);// write the page body to this file handle + curl_easy_setopt(curl,CURLOPT_FOLLOWLOCATION,1L);//allow redirects + curl_easy_setopt(curl, CURLOPT_TIMEOUT, 60L); + curl_easy_setopt(curl, CURLOPT_CONNECTTIMEOUT, 20L); + curl_easy_setopt(curl, CURLOPT_MAXREDIRS, 5L);//max num of redirects + curl_easy_setopt(curl, CURLOPT_MAXFILESIZE, 5000000L);//don't download if over 
5MB + curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0L);//0 or 1 to verify ssl + //curl_easy_setopt(curl, CURLOPT_VERBOSE, 1L);//set verbose + res = curl_easy_perform(curl);// get it! + //if(res == CURLE_OK) {//get final redirect url //-- don't check for this, causes segfault if "transfer closed with outstanding read data remaining" + curl_easy_getinfo(curl, CURLINFO_EFFECTIVE_URL, &finalURL); + curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &response_code); + + if(finalURL){ + printf("Effective URL: %s\nResponse: %ld, ", finalURL,response_code); + finalURLsize = strlen(finalURL); + } + + //curl_easy_cleanup(curl); //cleanup moved further down because finalURL is needed at insert + + //get file size + fseek(fp, 0L, SEEK_END); + size = ftell(fp); + + fclose(fp); + } + + //if effective URL contains ':443', CURL will fail to download this page on next update. Remove :443 from finalURL. + char *ptr_substring = NULL; + int substringpos=0; + if(finalURLsize > 3) + ptr_substring = strstr(finalURL,":443"); + if(ptr_substring != NULL && skipurl == 0 && finalURLsize<=500){ + substringpos = ptr_substring - finalURL; + int poscount = substringpos; + memcpy(correctedURL,finalURL,substringpos);//copy before substring + while(1){//copy after substring + correctedURL[poscount] = finalURL[poscount+4]; + if(finalURL[poscount+4] == 0) + break; + poscount++; + } + finalURL = correctedURL; + finalURLsize = strlen(finalURL); + printf("\nSetting final URL as: %s\n", finalURL); + } + + int finalURLcount=0; + while(finalURL[finalURLcount]!=0){ + if(finalURL[finalURLcount]=='\''){ + skipurl=1; + getURLs=0; + printf("\nURL contains single-quote. 
Skipping."); + } + finalURLcount++; + } + + //when crawling through hyperlinks, log that the url was accessed, use the original url, not finalURL + if(skipurl==0 && task != 0 && task[0]=='2'){ + if (mysql_query(con, "use wibytemp")) + { + finish_with_error(con); + } + char sqlquerylogurl[2000]; + memset(sqlquerylogurl,0,2000); + strcpy(sqlquerylogurl,"INSERT INTO crawled (url_noprefix) VALUES('"); + strcat(sqlquerylogurl,urlnoprefix); + strcat(sqlquerylogurl,"')"); + if (mysql_query(con, sqlquerylogurl)) + { + finish_with_error(con); + } + if (mysql_query(con, "use wiby")) + { + finish_with_error(con); + } + } + + if(finalURLsize>500){ + skipurl=1; + getURLs=0; + printf("\nURL is too long"); + } + + if(task != 0 && task[0]=='2' && canCrawl(finalURLsize,finalURL)==0){ + printf("\nEffective URL failed crawl rules."); + skipurl=1; + getURLs=0; + } + + char finalURLnoprefix[finalURLsize-prefixsize+100]; + char httpAllow[] = "0"; + memset(finalURLnoprefix,0,finalURLsize-prefixsize+100); + int updatereserve=0; + char idReserve[200]; + + if(skipurl==0){ + //see if server permitted an http connection + if(finalURL != NULL){ + if(finalURL[4]==':') + httpAllow[0] = '1'; + } + else if(http > 0 || httpwww > 0){ + httpAllow[0] = '1'; + } + + //Remove the prefix from the final URL, to store into url_noprefix + //find out if its http or https or http://www. or https://www. + httpwww=httpswww=http=https=0; + + if(finalURLsize > 4){ + if(finalURL[4]==':') + http = 7; + if(finalURL[4]=='s' || finalURL[4]=='S') + https = 8; + } + if(finalURLsize > 11){ + if((finalURL[7]=='w' || finalURL[7]=='W') && (finalURL[8]=='w' || finalURL[8]=='W') && ((finalURL[9]=='w' || finalURL[9]=='W') || finalURL[9]=='1' || finalURL[9]=='2' || finalURL[9]=='3') && finalURL[10]=='.' 
){ + httpwww = 11; + http = https = 0; + } + if(finalURL[7]=='/' && (finalURL[8]=='w' || finalURL[8]=='W') && (finalURL[9]=='w' || finalURL[9]=='W') && ((finalURL[9]=='w' || finalURL[9]=='W') || finalURL[9]=='1' || finalURL[9]=='2' || finalURL[9]=='3') && finalURL[11]=='.' ){ + httpswww = 12; + http = https = 0; + } + } + + int finalURL_prefixsize = httpswww+httpwww+https+http, finalurlnoprefixcount = 0; + urlcount=0; + + //store the final url without prefix to finalURLnoprefix + while(finalURL[urlcount] != 0){ + if(urlcount>finalURL_prefixsize-1) + { + finalURLnoprefix[finalurlnoprefixcount]=finalURL[urlcount]; + finalurlnoprefixcount++; + } + urlcount++; + } + + //Double check that the URL is in fact not in the DB, by also searching for the effective URL from libcurl and its url in the table + int foundindoublecheck=0; + if(idexistsalready == 0){ + mysql_free_result(resulturlcheck); + char doublecheckurl[finalURLsize+100]; + memset(doublecheckurl,0,finalURLsize+100); + strcpy(doublecheckurl,"SELECT id,updatable,title,enable,fault,url,url_noprefix,shard FROM windex WHERE url_noprefix = '"); + strcat(doublecheckurl,finalURLnoprefix); + strcat(doublecheckurl,"';"); + if (mysql_query(con, doublecheckurl)) + { + finish_with_error(con); + } + resulturlcheck = mysql_store_result(con); + if(resulturlcheck == NULL) + { + finish_with_error(con); + } + row = mysql_fetch_row(resulturlcheck); + if(row != NULL) + { + printf("\nDoublechecked effective URL in windex, found ID %s\n",row[0]); + idexistsalready = 1; + idexistsvalue = row[0]; + previousID[0] = atoi(row[0]); + updatableOldDBval = row[1]; + dbtitle = row[2]; + enableOldDBval = row[3]; + fault = row[4]; + dburl=row[5]; + dburl_noprefix=row[6]; + shard=row[7]; + if((task != 0 && task[0]=='2') || updatableOldDBval[0] == '0') + alreadydone=1; + foundindoublecheck=1; + } + //Log duplicate rows (they shouldn't exist) + num_rows = mysql_num_rows(resulturlcheck); + if(num_rows > 1){ + FILE *duplicates = 
fopen("duplicates.txt", "a"); + fputs (dburl,duplicates); + fputs ("\r\n",duplicates); + fclose(duplicates); + } + //Does this crawl attempt, along with the last 9 have the same ID? There is possibly a duplicate db entry, or some other problem. + if(previousID[0] != -1){ + if(previousID[0] == previousID[9] && previousID[0] == previousID[8] && previousID[0] == previousID[7] && previousID[0] == previousID[6] && previousID[0] == previousID[5] && previousID[0] == previousID[4] && previousID[0] == previousID[3] && previousID[0] == previousID[2] && previousID[0] == previousID[1]){ + printf("\nWARNING: Last 10 crawl attempts are all for the same page. Will not continue crawling in this situation. Is the same page being submitted over and over? Also, duplicate table entries of the same URL in windex can cause this behavior. Check the database, and duplicates.txt\n\n"); + exit(0); + } + } + } + + //if doing an update when using multiple crawlers, reserve the id and verify the URL is still associated with it + if(alreadydone==0 && id_assigned==1 && idexistsalready==1){ + if (mysql_query(con, "use wibytemp;")) + { + finish_with_error(con); + } + memset(idReserve,0,200); + strcpy(idReserve,"INSERT into reserve_id (id, crawler_id) VALUES ("); + strcat(idReserve,idexistsvalue); + strcat(idReserve,","); + strcat(idReserve,argv[1]); + strcat(idReserve,");"); + if(mysql_query(con, idReserve)) + { + printf("\nID is already reserved, will try again. 
Clearing old reservations..."); + memset(idReserve,0,200); + strcpy(idReserve,"DELETE FROM reserve_id WHERE time < NOW() - INTERVAL 10 MINUTE OR crawler_id = "); + strcat(idReserve,argv[1]); + if(mysql_query(con, idReserve)){ + finish_with_error(con); + }else{ + printf(" Done."); + } + alreadydone=1; + } + //back to wiby database + if (mysql_query(con, "use wiby;")) + { + finish_with_error(con); + } + updatereserve=1; + + //check that the url being updated is still assigned to that ID + memset(checkurl,0,checkurlsize); + if(task != 0 && task[0] == '1'){ + strcpy(checkurl,"SELECT id FROM windex WHERE url = '"); + strcat(checkurl,url); + strcat(checkurl,"';"); + }else{ + if(foundindoublecheck==0){ + strcpy(checkurl,"SELECT id FROM windex WHERE url_noprefix = '"); + if(slashfound==0) + { + strcat(checkurl,urlnoprefix); + strcat(checkurl,"' OR url_noprefix = '"); + strcat(checkurl,urlnoprefix);strcat(checkurl,"/"); + strcat(checkurl,"' OR url_noprefix = '"); + strcat(checkurl,urlnoprefix);strcat(checkurl,"/index.html"); + strcat(checkurl,"' OR url_noprefix = '/index.htm"); + strcat(checkurl,"';"); + }else{ + strcat(checkurl,urlnoprefix); + strcat(checkurl,"' OR url_noprefix = '"); + strcat(checkurl,urlnoprefixnoslash); + strcat(checkurl,"' OR url_noprefix = '"); + strcat(checkurl,urlnoprefix);strcat(checkurl,"index.html"); + strcat(checkurl,"' OR url_noprefix = '"); + strcat(checkurl,urlnoprefix);strcat(checkurl,"index.htm"); + strcat(checkurl,"';"); + } + }else{ + strcpy(checkurl,"SELECT id FROM windex WHERE url = '"); + strcat(checkurl,finalURL); + strcat(checkurl,"';"); + } + } + //query db + if (mysql_query(con, checkurl)) + { + finish_with_error(con); + } + MYSQL_RES *resulturlreservecheck = mysql_store_result(con); + if(resulturlcheck == NULL) + { + finish_with_error(con); + } + //grab the first entry (fifo) + char *URLcheckID; + MYSQL_ROW rowURLCheck = mysql_fetch_row(resulturlreservecheck); + if(rowURLCheck != NULL) + { + URLcheckID = rowURLCheck[0]; + } + 
if(URLcheckID != 0 && atoi(URLcheckID) != atoi(idexistsvalue)){ + printf("\nID was already reserved, will try again later."); + alreadydone=1; + } + mysql_free_result(resulturlreservecheck); + } + } + //=====================Extract text from HTML file======================= + if(size < 5000000 && skipurl==0 && alreadydone==0) + { + //switch on/off hyperlink collecting (if crawling through hyperlinks, or from regular refresh while crawl_repeat is on, or during manual submission when appropriate limits are set) + if((task != 0 && task[0]=='2' && (n_crawl_depth > 0 || n_crawl_depth < 0) && (n_crawl_pages > 0 || n_crawl_pages < 0)) || (task==0 && (n_crawl_depth > 0 || n_crawl_depth < 0) && (n_crawl_pages > 0 || n_crawl_pages < 0)) || (task != 0 && task[0]=='1' && crawl_repeat != 0 && crawl_repeat[0]=='1' && (n_crawl_pages > 0 || n_crawl_pages < 0))){ + getURLs=1; + }else{ + getURLs=0; + } + + htmlparse(); + + //need the finalURL path info also + urlparse(finalURL); + memset(urlPath_finalURL,0,1001); + strcpy(urlPath_finalURL,urlPath); + memset(folderPath_finalURL,0,1001); + strcpy(folderPath_finalURL,folderPath); + memset(urlPrefix_finalURL,0,1001); + strcpy(urlPrefix_finalURL,prefix_fromlist); + memset(urlNPNP_finalURL,0,1001); + strcpy(urlNPNP_finalURL,urlnopathnoprefix_fromlist); + + if(urlPrefix_finalURL[0]==0 || urlNPNP_finalURL[0]==0 || urlPath_finalURL[0]==0) + noindex = 1; + + }else{ + noindex = 1; + } + + //check if rules are enforced (only for pages that are autocrawled) + if(force_rules != 0 && force_rules[0]=='1' && task != 0 && task[0]=='2' && noindex == 0 && response_code == 200){ + if(num_scripts > 2 || num_stylesheets > 2){ + noindex = 1; + printf("\nFailed rule check"); + } + } + + int skip = 0, titlechanged = 0, escape = 0, escapetotal = 0, redirected = 0; + //Check if noindex and size + //if(((noindex == 0 /*&& bodysize < 1900000*/ && bodysize > 10) || (noindex == 0 /*&& bodysize < 1900000*/ && descriptionsize > 10)) && response_code == 200 && 
alreadydone==0) + if((emptytitle == 0 || descriptionsize > 0 || bodysize > 0) && response_code == 200 && alreadydone==0 && noindex == 0) + { + //=================Allocate memory for the parsed text from htmlparse() + //title = (char*)calloc(titlesize+1,sizeof(char)); + //keywords = (char*)calloc(keywordssize+1,sizeof(char)); + //description = (char*)calloc(descriptionsize+1,sizeof(char)); + //page = (char*)calloc(bodysize+1,sizeof(char)); + windexinsert = (char*)calloc(finalURLsize+urlnoprefixcount+bodysize+descriptionsize+keywordssize+titlesize+3001,sizeof(char)); + //shardinsert = (char*)calloc(finalURLsize+urlnoprefixcount+bodysize+descriptionsize+keywordssize+titlesize+3001,sizeof(char)); + windexupdate = (char*)calloc(finalURLsize+urlnoprefixcount+bodysize+descriptionsize+keywordssize+titlesize+3001,sizeof(char)); + windexRandUpdate = (char*)calloc(finalURLsize+urlnoprefixcount+bodysize+descriptionsize+keywordssize+titlesize+3001,sizeof(char)); + titlecheckinsert = (char*)calloc(finalURLsize+titlesize+1001,sizeof(char)); + + /*if(title == NULL || keywords == NULL || description == NULL || page == NULL || windexinsert == NULL || windexupdate == NULL) + { + printf("\nError allocating memory for webpage"); + //cleanup sql stuff + mysql_free_result(resulturlcheck); + mysql_free_result(result); + mysql_close(con); + exit(0); + }*/ + + + //Check if this is a new page: check if the title found in windex is the same as the parsed title. If not, put the page back into review. 
+ int dbtitlesize = 0,titlecheckTitleSize = 0, dbNoTitle=0,extrapos=0; + if(idexistsalready==1) + { + //going to insert the crawled title into a "titlecheck" table with the url for reference, then we're going to read back the + //title and count the number of bytes vs what was read from dbtitlesize to determine if title changed + //this is because bytes read from db must be the same charset as what is crawled to get a proper count + //unsupported charsets can end up truncating data, giving incorrect title check, this method avoids that issue + + if (mysql_query(con, "use wibytemp;")) + { + finish_with_error(con); + } + //set charset based on crawled page charset tag + if (mysql_query(con, mysqlcharset)) + { + finish_with_error(con); + } + //insert title into wibytemp for comparison + strcpy(titlecheckinsert,"INSERT INTO titlecheck (url,title) VALUES ('"); + strcat(titlecheckinsert,finalURL); + strcat(titlecheckinsert,"','"); + strcat(titlecheckinsert,title); + strcat(titlecheckinsert,"');"); + if (mysql_query(con, titlecheckinsert)) + { + finish_with_error(con); + } + if (mysql_query(con, "SET CHARSET utf8;")) + { + finish_with_error(con); + } + //now read back the title from the database + char checktitle[finalURLsize+dbtitlesize+1000]; + memset(checktitle,0,finalURLsize+dbtitlesize+1000); + strcpy(checktitle,"SELECT title FROM titlecheck WHERE url = '"); + strcat(checktitle,finalURL);strcat(checktitle,"' ORDER BY id DESC;"); + //query db + if (mysql_query(con, checktitle)) + { + finish_with_error(con); + } + MYSQL_RES *resulttitlecheck = mysql_store_result(con); + if(resulttitlecheck == NULL) + { + finish_with_error(con); + } + + //grab the first entry (fifo) + MYSQL_ROW rowTitleCheck = mysql_fetch_row(resulttitlecheck); + char *titlecheckTitle; + int titlecheckTitleSize = 0; + titlecheckTitle = rowTitleCheck[0]; + //printf("\n %s",rowTitleCheck[0]); + + //delete the entry from the table + char titlecheckremove[finalURLsize+1000]; + 
memset(titlecheckremove,0,finalURLsize+1000); + strcpy(titlecheckremove,"DELETE FROM titlecheck WHERE url ='"); + strcat(titlecheckremove,finalURL);strcat(titlecheckremove,"';"); + if (mysql_query(con, titlecheckremove)) + { + finish_with_error(con); + } + + //back to wiby database + if (mysql_query(con, "use wiby;")) + { + finish_with_error(con); + } + + //check if original dburl is now getting redirected from finalurl (should be sent to review) + int finalURLnoprefix_size = strlen(finalURLnoprefix), dburl_noprefix_size = strlen(dburl_noprefix); + if(finalURLnoprefix_size != dburl_noprefix_size){ + redirected = 1; + printf("\nIndexed page is being redirected."); + }else{ + for(int i=0;i 0 && emptytitle == 0)) //previous, before db wibytemp titlecheck method + if((dbNoTitle == 0 && dbtitlesize != titlecheckTitleSize) || (dbNoTitle == 1 && titlesize > 0 && emptytitle == 0) || (URL_is_dbtitle == 1 && dbtitlesize != titlecheckTitleSize && titlesize > 0 && emptytitle == 0)) + { + titlechanged = 1; + } + //printf("\n|%s|\n%d\n%d\n%d\n%d\n%d",dbtitle,titlesize,dbtitlesize,extrapos,dbNoTitle,titlechanged); + + //cleanup some sql stuff + mysql_free_result(resulttitlecheck); + } + + if(titlechanged == 0 && redirected == 0) + { + //====================Load the parsed text into windex!================== + + if (mysql_query(con, mysqlcharset))//set charset based on page charset tag + { + finish_with_error(con); + } + + //strcpy(windexinsert,"INSERT INTO windex (url,title,tags,description,body,worksafe,enable,date,approver,surprise,updatable) VALUES ('"); + strcpy(windexinsert,"INSERT INTO windex (url,url_noprefix,title,description,body,worksafe,enable,date,approver,surprise,http,updatable,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,shard) VALUES ('"); + + strcpy(windexupdate,"UPDATE windex SET url = '"); + + int copiedRandom = 0; + int reserveFail = 0; + char randomreserve[100]; + char *randID; + char *randshard; + MYSQL_RES *resultRandID; + + 
if(idexistsalready == 0){//Insert new entry + //For search topics to be evenly discovered by all replicas or duplicate connections assigned to a specific search section, new rows must be scattered randomly across the database insead of sequental: + //Existing rows will be randomly selected and copied (inserted) into a new row at the bottom, and the new page will take the ID number of the old one through an update. + //select id from windex where enable = 1 order by rand() limit 1; + //insert into windex (url,title,tags,description,body,surprise,http,updatable,worksafe,enable,date,updated,approver,fault) select url,title,tags,description,body,surprise,http,updatable,worksafe,enable,date,updated,approver,fault from windex where id = 1338; + //the corresponding shard table will also be updated with the same ID and contents, which can be offloaded to another replica. + + + printf("\nInserting into index... "); + + if (mysql_query(con, "SELECT id, shard FROM windex WHERE enable = 1 ORDER BY rand() LIMIT 1;")) + { + finish_with_error(con); + } + resultRandID = mysql_store_result(con); + if (resultRandID==NULL) + { + finish_with_error(con); + } + MYSQL_ROW row = mysql_fetch_row(resultRandID); + if(row != NULL){ + randID = row[0]; + idexistsvalue = row[0]; + randshard = row[1]; + } + + //reserve the randomly selected ID when running more than one crawler + if(row != NULL && id_assigned==1){ + if (mysql_query(con, "use wibytemp;")) + { + finish_with_error(con); + } + memset(randomreserve,0,100); + strcpy(randomreserve,"INSERT into reserve_id (id) VALUES ("); + strcat(randomreserve,randID); + strcat(randomreserve,");"); + if (mysql_query(con, randomreserve)) + { + printf("\nID is already reserved. 
Clearing old reservations..."); + if(mysql_query(con, "DELETE FROM reserve_id WHERE time < NOW() - INTERVAL 10 MINUTE")){ + finish_with_error(con); + }else{ + printf(" Done."); + } + reserveFail=1;//if error: more than one crawler attempted to reserve the same randomly selected ID + } + //back to wiby database + if (mysql_query(con, "use wiby;")) + { + finish_with_error(con); + } + } + + if(row == NULL || reserveFail==1){//if no rows in db yet or fails to reserve an ID + strcat(windexinsert,finalURL);strcat(windexinsert,"','"); + strcat(windexinsert,finalURLnoprefix);strcat(windexinsert,"','"); + //strcat(windexinsert,prefix);strcat(windexinsert,"','"); + if(titlesize > 0 && emptytitle == 0) { + strcat(windexinsert,title); + } + else { + if(finalURLsize < 111){ + strcat(windexinsert,finalURL); + } + else{ + strcat(windexinsert,"Untitled"); + } + } + strcat(windexinsert,"','"); + //if(tagsize > 0) {strcat(windexinsert,keywords);} + //strcat(windexinsert,"','"); + if(descriptionsize > 0) {strcat(windexinsert,description);} + strcat(windexinsert,"','"); + if(bodysize > 0) {strcat(windexinsert,body);} + strcat(windexinsert,"',"); + strcat(windexinsert,worksafe); + strcat(windexinsert,",1,now(),'"); + strcat(windexinsert,approver); + strcat(windexinsert,"',"); + strcat(windexinsert,surprise); + strcat(windexinsert,","); + strcat(windexinsert,httpAllow); + strcat(windexinsert,","); + strcat(windexinsert,updatable); + if(task != 0 && task[0]=='2'){//came from link crawling + strcat(windexinsert,",'"); + strcat(windexinsert,crawl_tree); + strcat(windexinsert,"','"); + strcat(windexinsert,crawl_family); + strcat(windexinsert,"',"); + strcat(windexinsert,crawl_pages); + strcat(windexinsert,","); + strcat(windexinsert,crawl_type); + strcat(windexinsert,","); + strcat(windexinsert,"0"); + strcat(windexinsert,", force_rules = "); + strcat(windexinsert,force_rules); + }else{ + strcat(windexinsert,","); + strcat(windexinsert,"NULL,"); + strcat(windexinsert,"NULL,"); + 
strcat(windexinsert,crawl_pages); + strcat(windexinsert,","); + strcat(windexinsert,crawl_type); + strcat(windexinsert,","); + strcat(windexinsert,crawl_repeat); + strcat(windexinsert,", force_rules = "); + strcat(windexinsert,force_rules); + } + strcat(windexinsert,","); + strcat(windexinsert,shardnumstr); + strcat(windexinsert,")"); + if (mysql_query(con, windexinsert)) + { + finish_with_error(con); + } + + //insert into the shard table for the new row + if(nShards>0){ + memset(windexinsert,0,strlen(windexinsert)); + strcpy(windexinsert,"INSERT INTO ws"); + strcat(windexinsert,shardnumstr); + strcat(windexinsert," (id,url,url_noprefix,title,tags,description,body,surprise,http,updatable,worksafe,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,enable,date,updated,approver,fault,shard) SELECT id,url,url_noprefix,title,tags,description,body,surprise,http,updatable,worksafe,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,enable,date,updated,approver,fault,shard FROM windex WHERE id = LAST_INSERT_ID();"); + /*//get the last ID + MYSQL_RES *resultIDnum; + char *lastIDnum; + + if (mysql_query(con, "SELECT LAST_INSERT_ID() FROM windex limit 1")) + { + finish_with_error(con); + } + MYSQL_ROW rowLastID = mysql_fetch_row(resultIDnum); + if(rowLastID != NULL){ + lastIDnum = rowLastID[0]; + } + + strcpy(shardinsert,"INSERT INTO ws"); + strcat(shardinsert,shardnumstr); + strcat(shardinsert," (id,url,url_noprefix,title,tags,description,body,surprise,http,updatable,worksafe,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,enable,date,updated,approver,fault,shard) SELECT id,url,url_noprefix,title,tags,description,body,surprise,http,updatable,worksafe,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,enable,date,updated,approver,fault,shard FROM windex WHERE id = "); + strcat(shardinsert,lastIDnum); + if (mysql_query(con, shardinsert)) + { + finish_with_error(con); + } + 
mysql_free_result(resultIDnum); */ + if (mysql_query(con, windexinsert)) + { + finish_with_error(con); + } + } + } + else{ + //copy contents of randomly selected row to a new row in windex. + strcpy(windexRandUpdate,"INSERT INTO windex (url,url_noprefix,title,tags,description,body,surprise,http,updatable,worksafe,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,enable,date,updated,approver,fault,shard) SELECT url,url_noprefix,title,tags,description,body,surprise,http,updatable,worksafe,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,enable,date,updated,approver,fault,shard FROM windex WHERE id = "); + strcat(windexRandUpdate,randID); + if (mysql_query(con, windexRandUpdate)) + { + finish_with_error(con); + } + if(nShards>0){ + //Also copy that new row into a new row of the same ID in the round-robin assigned shard table + //update the shard id in windex + memset(windexRandUpdate,0,strlen(windexRandUpdate)); + strcpy(windexRandUpdate,"UPDATE windex set shard = "); + strcat(windexRandUpdate,shardnumstr); + strcat(windexRandUpdate," WHERE id = LAST_INSERT_ID()"); + if (mysql_query(con, windexRandUpdate)) + { + finish_with_error(con); + } + //insert that row into the next shard + memset(windexRandUpdate,0,strlen(windexRandUpdate)); + strcpy(windexRandUpdate,"INSERT INTO ws"); + strcat(windexRandUpdate,shardnumstr); + strcat(windexRandUpdate," (id,url,url_noprefix,title,tags,description,body,surprise,http,updatable,worksafe,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,enable,date,updated,approver,fault,shard) SELECT id,url,url_noprefix,title,tags,description,body,surprise,http,updatable,worksafe,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,enable,date,updated,approver,fault,shard FROM windex WHERE id = LAST_INSERT_ID()"); + if (mysql_query(con, windexRandUpdate)) + { + finish_with_error(con); + } + + //Overwrite the randomly selected row with the contents of the newly 
crawled webpage + memset(windexRandUpdate,0,strlen(windexRandUpdate)); + strcpy(windexRandUpdate,"UPDATE windex SET url = '"); + strcat(windexRandUpdate,finalURL); + strcat(windexRandUpdate,"', url_noprefix = '"); + strcat(windexRandUpdate,finalURLnoprefix); + strcat(windexRandUpdate,"', title = '"); + if(titlesize > 0 && emptytitle == 0){ + strcat(windexRandUpdate,title); + } + else{ + if(finalURLsize < 111){ + strcat(windexRandUpdate,finalURL); + } + else{ + strcat(windexRandUpdate,"Untitled"); + } + } + strcat(windexRandUpdate,"', tags = NULL, description = '"); + strcat(windexRandUpdate,description); + strcat(windexRandUpdate,"', body = '"); + strcat(windexRandUpdate,body); + strcat(windexRandUpdate,"', worksafe = "); + strcat(windexRandUpdate,worksafe); + strcat(windexRandUpdate,", approver = '"); + strcat(windexRandUpdate,approver); + strcat(windexRandUpdate,"', surprise = "); + strcat(windexRandUpdate,surprise); + strcat(windexRandUpdate,", http = "); + strcat(windexRandUpdate,httpAllow); + strcat(windexRandUpdate,", updatable = "); + strcat(windexRandUpdate,updatable); + if(task==0){//didn't come from refresh or link crawling + strcat(windexRandUpdate,", crawl_tree = NULL"); + strcat(windexRandUpdate,", crawl_family = NULL"); + strcat(windexRandUpdate,", crawl_pages = "); + strcat(windexRandUpdate,crawl_pages); + strcat(windexRandUpdate,", crawl_type = "); + strcat(windexRandUpdate,crawl_type); + strcat(windexRandUpdate,", crawl_repeat = "); + strcat(windexRandUpdate,crawl_repeat); + strcat(windexRandUpdate,", force_rules = "); + strcat(windexRandUpdate,force_rules); + }else if(task != 0 && task[0]=='2'){//came from link crawling + strcat(windexRandUpdate,", crawl_tree = '"); + strcat(windexRandUpdate,crawl_tree); + strcat(windexRandUpdate,"', crawl_family ='"); + strcat(windexRandUpdate,crawl_family); + strcat(windexRandUpdate,"', crawl_pages = "); + strcat(windexRandUpdate,crawl_pages); + strcat(windexRandUpdate,", crawl_type = "); + 
strcat(windexRandUpdate,crawl_type); + strcat(windexRandUpdate,", crawl_repeat = "); + strcat(windexRandUpdate,"0"); + strcat(windexRandUpdate,", force_rules = "); + strcat(windexRandUpdate,force_rules); + } + strcat(windexRandUpdate,", updated = CURRENT_TIMESTAMP, date = now(), fault = 0 WHERE id = "); + strcat(windexRandUpdate,randID); + if (mysql_query(con, windexRandUpdate)) + { + finish_with_error(con); + } + + //Finally, update the corresponding shard table row + if(randshard != 0){ + memset(windexRandUpdate,0,strlen(windexRandUpdate)); + strcpy(windexRandUpdate,"UPDATE ws"); + strcat(windexRandUpdate,randshard); + strcat(windexRandUpdate," SET url = '"); + strcat(windexRandUpdate,finalURL); + strcat(windexRandUpdate,"', url_noprefix = '"); + strcat(windexRandUpdate,finalURLnoprefix); + strcat(windexRandUpdate,"', title = '"); + if(titlesize > 0 && emptytitle == 0){ + strcat(windexRandUpdate,title_filtered); + } + else{ + if(finalURLsize < 111){ + strcat(windexRandUpdate,finalURL); + } + else{ + strcat(windexRandUpdate,"Untitled"); + } + } + strcat(windexRandUpdate,"', tags = NULL, description = '"); + strcat(windexRandUpdate,description_filtered); + strcat(windexRandUpdate,"', body = '"); + strcat(windexRandUpdate,body_filtered); + strcat(windexRandUpdate,"', worksafe = "); + strcat(windexRandUpdate,worksafe); + strcat(windexRandUpdate,", approver = '"); + strcat(windexRandUpdate,approver); + strcat(windexRandUpdate,"', surprise = "); + strcat(windexRandUpdate,surprise); + strcat(windexRandUpdate,", http = "); + strcat(windexRandUpdate,httpAllow); + strcat(windexRandUpdate,", updatable = "); + strcat(windexRandUpdate,updatable); + if(task==0){//didn't come from refresh or link crawling + strcat(windexRandUpdate,", crawl_tree = NULL"); + strcat(windexRandUpdate,", crawl_family = NULL"); + strcat(windexRandUpdate,", crawl_pages = "); + strcat(windexRandUpdate,crawl_pages); + strcat(windexRandUpdate,", crawl_type = "); + strcat(windexRandUpdate,crawl_type); + 
strcat(windexRandUpdate,", crawl_repeat = "); + strcat(windexRandUpdate,crawl_repeat); + strcat(windexRandUpdate,", force_rules = "); + strcat(windexRandUpdate,force_rules); + }else if(task != 0 && task[0]=='2'){//came from link crawling + strcat(windexRandUpdate,", crawl_tree = '"); + strcat(windexRandUpdate,crawl_tree); + strcat(windexRandUpdate,"', crawl_family ='"); + strcat(windexRandUpdate,crawl_family); + strcat(windexRandUpdate,"', crawl_pages = "); + strcat(windexRandUpdate,crawl_pages); + strcat(windexRandUpdate,", crawl_type = "); + strcat(windexRandUpdate,crawl_type); + strcat(windexRandUpdate,", crawl_repeat = "); + strcat(windexRandUpdate,"0"); + strcat(windexRandUpdate,", force_rules = "); + strcat(windexRandUpdate,force_rules); + } + strcat(windexRandUpdate,", updated = CURRENT_TIMESTAMP, date = now(), fault = 0 WHERE id = "); + strcat(windexRandUpdate,randID); + if (mysql_query(con, windexRandUpdate)) + { + finish_with_error(con); + } + } + } + copiedRandom = 1; + } + } + if(idexistsalready == 1 || (copiedRandom == 1 && nShards == 0)){ //update an existing entry or a new entry with no shard listed in row + if(idexistsalready == 1) + printf("\nUpdating index... 
"); + strcat(windexupdate,finalURL); + strcat(windexupdate,"', url_noprefix = '"); + strcat(windexupdate,finalURLnoprefix); + strcat(windexupdate,"', title = '"); + if(titlesize > 0 && emptytitle == 0){ + strcat(windexupdate,title); + } + else{ + if(finalURLsize < 111){ + strcat(windexupdate,finalURL); + } + else{ + strcat(windexupdate,"Untitled"); + } + } + if(copiedRandom == 0)//normal update + strcat(windexupdate,"', description = '"); + else{ + strcat(windexupdate,"', tags = NULL, description = '"); + } + strcat(windexupdate,description); + strcat(windexupdate,"', body = '"); + strcat(windexupdate,body); + strcat(windexupdate,"', worksafe = "); + strcat(windexupdate,worksafe); + if(copiedRandom == 1){ + strcat(windexupdate,", approver = '"); + strcat(windexupdate,approver); + strcat(windexupdate,"'"); + } + strcat(windexupdate,", surprise = "); + strcat(windexupdate,surprise); + strcat(windexupdate,", http = "); + strcat(windexupdate,httpAllow); + strcat(windexupdate,", updatable = "); + strcat(windexupdate,updatable); + if(task==0){//didn't come from refresh or link crawling + if(idexistsalready == 0){ + strcat(windexupdate,", crawl_tree = NULL"); + strcat(windexupdate,", crawl_family = NULL"); + } + strcat(windexupdate,", crawl_pages = "); + strcat(windexupdate,crawl_pages); + strcat(windexupdate,", crawl_type = "); + strcat(windexupdate,crawl_type); + strcat(windexupdate,", crawl_repeat = "); + strcat(windexupdate,crawl_repeat); + strcat(windexupdate,", force_rules = "); + strcat(windexupdate,force_rules); + }else if(task != 0 && task[0]=='2' && idexistsalready == 0){//came from link crawling + strcat(windexupdate,", crawl_tree = '"); + strcat(windexupdate,crawl_tree); + strcat(windexupdate,"', crawl_family ='"); + strcat(windexupdate,crawl_family); + strcat(windexupdate,"', crawl_pages = "); + strcat(windexupdate,crawl_pages); + strcat(windexupdate,", crawl_type = "); + strcat(windexupdate,crawl_type); + strcat(windexupdate,", crawl_repeat = "); + 
strcat(windexupdate,"0"); + strcat(windexupdate,", force_rules = "); + strcat(windexupdate,force_rules); + } + if(copiedRandom == 0)//normal update + strcat(windexupdate,", updated = CURRENT_TIMESTAMP, fault = 0 WHERE id = "); + else + strcat(windexupdate,", updated = CURRENT_TIMESTAMP, date = now(), fault = 0 WHERE id = "); + strcat(windexupdate,idexistsvalue);//will be same as randID if a new page is replacing that row + if (mysql_query(con, windexupdate)) + { + finish_with_error(con); + } + + //update shard + if(nShards>0 && idexistsalready == 1 && shard != 0){ + memset(windexupdate,0,strlen(windexupdate)); + strcpy(windexupdate,"UPDATE ws"); + strcat(windexupdate,shard); + strcat(windexupdate," SET url = '"); + strcat(windexupdate,finalURL); + strcat(windexupdate,"', url_noprefix = '"); + strcat(windexupdate,finalURLnoprefix); + strcat(windexupdate,"', title = '"); + if(titlesize > 0 && emptytitle == 0){ + strcat(windexupdate,title_filtered); + } + else{ + if(finalURLsize < 111){ + strcat(windexupdate,finalURL); + } + else{ + strcat(windexupdate,"Untitled"); + } + } + if(copiedRandom == 0)//normal update + strcat(windexupdate,"', description = '"); + else{ + strcat(windexupdate,"', tags = NULL, description = '"); + } + strcat(windexupdate,description_filtered); + strcat(windexupdate,"', body = '"); + strcat(windexupdate,body_filtered); + strcat(windexupdate,"', worksafe = "); + strcat(windexupdate,worksafe); + //strcat(windexupdate,", approver = '"); + //strcat(windexupdate,approver); + //strcat(windexupdate,"', surprise = "); + strcat(windexupdate,", surprise = "); + strcat(windexupdate,surprise); + strcat(windexupdate,", http = "); + strcat(windexupdate,httpAllow); + strcat(windexupdate,", updatable = "); + strcat(windexupdate,updatable); + if(task==0){//didn't come from refresh or link crawling + strcat(windexupdate,", crawl_pages = "); + strcat(windexupdate,crawl_pages); + strcat(windexupdate,", crawl_type = "); + strcat(windexupdate,crawl_type); + 
strcat(windexupdate,", crawl_repeat = "); + strcat(windexupdate,crawl_repeat); + strcat(windexupdate,", force_rules = "); + strcat(windexupdate,force_rules); + } + strcat(windexupdate,", updated = CURRENT_TIMESTAMP, fault = 0 WHERE id = "); + strcat(windexupdate,idexistsvalue);//will be same as randID if a new page is replacing that row + if (mysql_query(con, windexupdate)) + { + finish_with_error(con); + } + } + } + + //unreserve randomly selected ID + if(id_assigned==1 && idexistsalready==0 && reserveFail==0){ + if (mysql_query(con, "use wibytemp;")) + { + finish_with_error(con); + } + memset(randomreserve,0,100); + strcpy(randomreserve,"DELETE FROM reserve_id where id = "); + strcat(randomreserve,randID); + strcat(randomreserve,";"); + if (mysql_query(con, randomreserve)) + { + finish_with_error(con); + } + //back to wiby database + if (mysql_query(con, "use wiby;")) + { + finish_with_error(con); + } + } + //unreserve ID if doing an update + if(id_assigned==1 && updatereserve==1){ + if (mysql_query(con, "use wibytemp;")) + { + finish_with_error(con); + } + memset(idReserve,0,200); + strcpy(idReserve,"DELETE FROM reserve_id where id = "); + strcat(idReserve,idexistsvalue); + strcat(idReserve,";"); + if(mysql_query(con, idReserve)) + { + finish_with_error(con); + } + //back to wiby database + if (mysql_query(con, "use wiby;")) + { + finish_with_error(con); + } + } + //free result + if(idexistsalready == 0){ + mysql_free_result(resultRandID); + } + + //===================remove the entry from the indexqueue=============== + //printf("\nRemoving from queue..."); + char sqlqueryremove[200]; + memset(sqlqueryremove,0,200); + strcpy(sqlqueryremove,"DELETE FROM indexqueue WHERE id="); + strcat(sqlqueryremove,id);strcat(sqlqueryremove,";"); + if (mysql_query(con, sqlqueryremove)) + { + finish_with_error(con); + } + + printf("\n\nSuccess!"); + } + //clear page from memory + free(windexinsert); free(windexupdate); free(titlecheckinsert); free(windexRandUpdate); 
//free(shardinsert); + }else{ + skip = 1; + } + + if((skip == 1 || titlechanged == 1 || redirected == 1)){ + //from skip check: if(((noindex == 0 && bodysize < 1900000 && bodysize > 10) || (noindex == 0 && bodysize < 1900000 && descriptionsize > 10)) && response_code == 200 && alreadydone==0) + //printf("\nnoindex: %d\nbodysize: %ld\ndescriptionsize %ld\nresponse_code: %d\nalreadydone: %d\nskip: %d\ntitlechanged: %d\nredirected: %d",noindex,bodysize,descriptionsize,response_code,alreadydone,skip,titlechanged,redirected); + if(skip == 1){ + printf("\nDoesn't want to be indexed, size too big, 404, already done, failed rules, or security issue."); + //log previous failed link crawls + strcpy(previousfail[4],previousfail[3]); + strcpy(previousfail[3],previousfail[2]); + strcpy(previousfail[2],previousfail[1]); + strcpy(previousfail[1],previousfail[0]); + strcpy(previousfail[0],urlnoprefix); + } + printf("\nRemoving from queue..."); + char sqlqueryremove[200]; + memset(sqlqueryremove,0,200); + strcpy(sqlqueryremove,"DELETE FROM indexqueue WHERE id="); + strcat(sqlqueryremove,id);strcat(sqlqueryremove,";"); + + if (mysql_query(con, sqlqueryremove)) + { + finish_with_error(con); + } + if(alreadydone==0){ + if(idexistsalready == 1 && fault[0] == '1') + { + if(crawl_family != 0 && crawl_family[0] !='0'){ + printf("\nPage may no longer exist. Originated from link crawling. Removing from the index."); + FILE *abandoned = fopen("abandoned.txt", "a"); + fputs (url,abandoned); + fputs ("\r\n",abandoned); + fclose(abandoned); + }else{ + printf("\nPage may no longer exist. 
Moving to review."); + } + memset(sqlqueryremove,0,200); + strcpy(sqlqueryremove,"DELETE FROM windex WHERE id ="); + strcat(sqlqueryremove,idexistsvalue); + if (mysql_query(con, sqlqueryremove)) + { + finish_with_error(con); + } + if(nShards > 0 && shard != 0){ + memset(sqlqueryremove,0,200); + strcpy(sqlqueryremove,"DELETE FROM ws"); + strcat(sqlqueryremove,shard); + strcat(sqlqueryremove," WHERE id = "); + strcat(sqlqueryremove,idexistsvalue); + if (mysql_query(con, sqlqueryremove)) + { + finish_with_error(con); + } + } + if(crawl_family == 0 || (crawl_family != 0 && crawl_family[0] =='0')){ + char sqlqueryreview[1001]; + memset(sqlqueryreview,0,1001); + strcpy(sqlqueryreview,"INSERT INTO reviewqueue (url,worksafe) VALUES ('"); + strcat(sqlqueryreview,url);strcat(sqlqueryreview,"',"); + strcat(sqlqueryreview,worksafe);strcat(sqlqueryreview,");"); + if (mysql_query(con, sqlqueryreview)) + { + finish_with_error(con); + } + } + } + else if(idexistsalready == 1 && fault[0] != '1')//mark that there is a fault with the page, crawler will throw it back into review if it happens again + { + printf("\nFault found. 
Will try again later."); + char sqlqueryfault[450]; + memset(sqlqueryfault,0,450); + strcpy(sqlqueryfault,"UPDATE windex SET updated = CURRENT_TIMESTAMP, fault = 1 WHERE id = "); + strcat(sqlqueryfault,idexistsvalue); + if (mysql_query(con, sqlqueryfault)) + { + finish_with_error(con); + } + if(nShards>0 && shard != 0){ + memset(sqlqueryfault,0,450); + strcpy(sqlqueryfault,"UPDATE ws"); + strcat(sqlqueryfault,shard); + strcat(sqlqueryfault," SET updated = CURRENT_TIMESTAMP, fault = 1 WHERE id = "); + strcat(sqlqueryfault,idexistsvalue); + if (mysql_query(con, sqlqueryfault)) + { + finish_with_error(con); + } + } + } + else + { + FILE *abandoned = fopen("abandoned.txt", "a"); + fputs (url,abandoned); + fputs ("\r\n",abandoned); + fclose(abandoned); + } + } + } + //check if link crawling is specified, will still collect and crawl hyperlinks on pages that fail the rule check (when checked). + //make sure duplicates don't get crawled more than once + //check db if its already indexed too - do this at beginning instead? 
+ //crawl links if crawling through hyperlinks, or from regular refresh while crawl_repeat is on, or during manual submission when appropriate limits are set + if(nofollow==0 && getURLs==1 && alreadydone==0 && titlechanged == 0 && redirected == 0 && (emptytitle == 0 || descriptionsize > 0 || bodysize > 0) && response_code == 200){ + //cycle through url list, then construct an sql string around it, then insert it to indexqueue; + + //force crawl depth of 1 during a refresh if crawl_repeat is set + if(crawl_repeat != 0 && crawl_repeat[0]=='1' && task != 0 && task[0]=='1'){ + n_crawl_depth=1; + } + + if(n_crawl_depth>0)//below 0 = unlimited depth + n_crawl_depth--; + + memset(strDepth,0,101); + sprintf(strDepth,"%d",n_crawl_depth); + //itoa(n_crawl_depth,strDepth,10); + + memset(url_fromlist,0,url_fromlist_arraylen); + memset(url_insert,0,url_insert_arraylen); + int loopcount=0,elementnum=0,urls=0; + if(id_assigned == 1){ + strcpy(url_insert,"INSERT INTO indexqueue (url,worksafe,approver,surprise,task,crawl_tree,crawl_family,crawl_depth,crawl_pages,crawl_type,crawl_repeat,force_rules,crawler_id) VALUES ("); + }else{ + strcpy(url_insert,"INSERT INTO indexqueue (url,worksafe,approver,surprise,task,crawl_tree,crawl_family,crawl_depth,crawl_pages,crawl_type,crawl_repeat,force_rules) VALUES ("); + } + while(urlListShuffled[loopcount]!=0){ + switch(urlListShuffled[loopcount]){ + case '\n' ://see if url can be indexed, if so, add to sql insert statement + if(strlen(url_fromlist) < 500){ + urlparse(url_fromlist); + + //check if internal or external url + int isinternal=1; + if(rootdomain[0]!=0){ + isinternal=0; + }else if(url_fromlist[4]==':' || url_fromlist[5]==':'){ + isinternal=0; + }else if((url_fromlist[0]=='w' || url_fromlist[0]=='W') && (url_fromlist[1]=='w' || url_fromlist[1]=='W') && (url_fromlist[2]=='w' || url_fromlist[2]=='W') && url_fromlist[3]=='.'){ + isinternal=0; + } + int urlNPNP_finalURL_len=strlen(urlNPNP_finalURL); + int isabsolute=0; + if(isinternal==0 
&& urlNPNP_finalURL_len==strlen(urlnopathnoprefix_fromlist)){ + isinternal=isabsolute=1; + for(int q=0;q1){ + strcat(url_insert,", ("); + } + strcat(url_insert,"'"); + strcat(url_insert,urlPrefix_finalURL); + strcat(url_insert,urlNPNP_finalURL); + strcat(url_insert,url_fromlist); + strcat(url_insert,"',"); + strcat(url_insert,worksafe); + strcat(url_insert,",'"); + strcat(url_insert,approver); + strcat(url_insert,"',0,2,'"); + if(task==0){ + strcat(url_insert,url); + }else{ + strcat(url_insert,crawl_tree); + } + strcat(url_insert,"','"); + strcat(url_insert,finalURL); + strcat(url_insert,"',"); + strcat(url_insert,strDepth); + strcat(url_insert,","); + strcat(url_insert,crawl_pages); + strcat(url_insert,","); + strcat(url_insert,crawl_type); + strcat(url_insert,","); + strcat(url_insert,"0"); + strcat(url_insert,","); + strcat(url_insert,force_rules); + if(id_assigned == 1){ + strcat(url_insert,","); + strcat(url_insert,argv[1]); + } + strcat(url_insert,")"); + }else if(url_fromlist[0] != '/' && url_fromlist[0] != '.'){ + urls++; + if(urls>1){ + strcat(url_insert,", ("); + } + strcat(url_insert,"'"); + if(isabsolute==0){ + strcat(url_insert,urlPrefix_finalURL); + strcat(url_insert,urlNPNP_finalURL); + strcat(url_insert,folderPath_finalURL); + strcat(url_insert,urlcopy);//scrubed index.html + }else{ + strcat(url_insert,urlcopy); + } + strcat(url_insert,"',"); + strcat(url_insert,worksafe); + strcat(url_insert,",'"); + strcat(url_insert,approver); + strcat(url_insert,"',0,2,'"); + if(task==0){ + strcat(url_insert,url); + }else{ + strcat(url_insert,crawl_tree); + } + strcat(url_insert,"','"); + strcat(url_insert,finalURL); + strcat(url_insert,"',"); + strcat(url_insert,strDepth); + strcat(url_insert,","); + strcat(url_insert,crawl_pages); + strcat(url_insert,","); + strcat(url_insert,crawl_type); + strcat(url_insert,","); + strcat(url_insert,"0"); + strcat(url_insert,","); + strcat(url_insert,force_rules); + if(id_assigned == 1){ + strcat(url_insert,","); + 
strcat(url_insert,argv[1]); + } + strcat(url_insert,")"); + } + }else if(isinternal==0 && crawl_type != 0 && crawl_type[0] != '0'){//is external link + if(url_fromlist[0] != '.'){ + urls++; + if(urls>1){ + strcat(url_insert,", ("); + } + strcat(url_insert,"'"); + strcat(url_insert,prefix_fromlist); + strcat(url_insert,rootdomain); + strcat(url_insert,urlPath); + strcat(url_insert,"',"); + strcat(url_insert,worksafe); + strcat(url_insert,",'"); + strcat(url_insert,approver); + strcat(url_insert,"',0,2,'"); + if(task==0){ + strcat(url_insert,url); + }else{ + strcat(url_insert,crawl_tree); + } + strcat(url_insert,"','"); + strcat(url_insert,finalURL); + strcat(url_insert,"',"); + strcat(url_insert,strDepth); + strcat(url_insert,","); + strcat(url_insert,crawl_pages); + strcat(url_insert,","); + strcat(url_insert,crawl_type); + strcat(url_insert,","); + strcat(url_insert,"0"); + strcat(url_insert,","); + strcat(url_insert,force_rules); + if(id_assigned == 1){ + strcat(url_insert,","); + strcat(url_insert,argv[1]); + } + strcat(url_insert,")"); + } + } + } + memset(url_fromlist,0,url_fromlist_arraylen); + elementnum=0; + loopcount++; + break; + default : + if(loopcount(url_insert_arraylen-10000)) + break; + } + if(urls>0){ + strcat(url_insert,";"); + //insert into db + if (mysql_query(con, url_insert)) + { + finish_with_error(con); + } + } + } + if (curl){ + curl_easy_cleanup(curl);// cleanup curl (finalURL used at inserts, thats why we cleanup and the end here + curl_global_cleanup(); + } + }else{ + if(alreadydone == 0){ + printf("\nPage was flagged as unable to crawl or banned."); + }else if(idexistsalready==1){ + printf("\nPage is already indexed."); + } + printf("\nRemoving from queue..."); + char sqlqueryremove[200]; + memset(sqlqueryremove,0,200); + strcpy(sqlqueryremove,"DELETE FROM indexqueue WHERE id="); + strcat(sqlqueryremove,id); + if (mysql_query(con, sqlqueryremove)) + { + finish_with_error(con); + } + if(idexistsalready==1 && permitted==0){ + printf(" 
Removing from index..."); + memset(sqlqueryremove,0,200); + strcpy(sqlqueryremove,"DELETE FROM windex WHERE id="); + strcat(sqlqueryremove,idexistsvalue); + strcat(sqlqueryremove," AND updatable != '0'"); + if (mysql_query(con, sqlqueryremove)) + { + finish_with_error(con); + } + if(nShards>0 && shard != 0){ + memset(sqlqueryremove,0,200); + strcpy(sqlqueryremove,"DELETE FROM ws"); + strcat(sqlqueryremove,shard); + strcat(sqlqueryremove," WHERE id="); + strcat(sqlqueryremove,idexistsvalue); + strcat(sqlqueryremove," AND updatable != '0'"); + if (mysql_query(con, sqlqueryremove)) + { + finish_with_error(con); + } + } + } + FILE *abandoned = fopen("abandoned.txt", "a"); + fputs (url,abandoned); + fputs ("\r\n",abandoned); + fclose(abandoned); + } + + //cleanup more sql stuff + mysql_free_result(resulturlcheck); + + //rotate shard for next insert + if(nShards > 0){ + shardnum++; + if(shardnum == nShards) + shardnum=0; + sprintf(shardnumstr,"%d",shardnum); + } + printf(" Awaiting next page in queue...\n\n"); + } + //cleanup more sql stuff + mysql_free_result(result); + mysql_close(con); + + if(empty==1) + sleep(5);//sleep 5 seconds + } + exit(0); +} diff --git a/c/htmlparse.h b/c/htmlparse.h new file mode 100755 index 0000000..fc2a64c --- /dev/null +++ b/c/htmlparse.h @@ -0,0 +1,630 @@ +//Wiby HTML Parser +//Separates text from an HTML file +//Remember to also set sql_mode = "NO_BACKSLASH_ESCAPES" in my.cnf + +#include +#include +#include +#include + +#define window_len 100 +#define charset_len 100 +#define mysqlcharset_len 100 +#define title_len 144 +#define keywords_len 1024 +#define description_len 182 +#define robots_len 100 +#define body_len 81920 +#define urlList_len 102400 +#define strURL_len 102400 + +FILE *bodyfile,*titlefile, *keywordsfile, *descriptionfile, *noindexfile, *nofollowfile, *charsetfile, *urlfile, *shuffledurlfile; + +static char filename[] = "page.out"; + +char 
window[window_len],windowWithSpaces[window_len],charset[charset_len+1],mysqlcharset[mysqlcharset_len+1],title[title_len+1],keywords[keywords_len+1],description[description_len+1],robots[robots_len+1],body[body_len+1]; +char urlList[urlList_len+1],strURL[strURL_len+1],urlListShuffled[urlList_len+1],urlListHoldShuffled[urlList_len+1]; +char title_filtered[title_len+1], body_filtered[body_len+1], description_filtered[description_len+1]; +int titlefound=0,charsetfound=0,descriptionfound=0,keywordsfound=0,robotsfound=0,nofollow=0,noindex=0,scriptfound=0,stylefound=0,urlFound=0,urlTagFound=0,numURL=0,emptytitle=1,spaces=0,seeded=0,num_stylesheets=0,num_scripts=0,getURLs=1; +long charsetsize=0,titlesize=0,keywordssize=0,descriptionsize=0,robotssize=0,bodysize=0; + +int matchMySQLcharset(int html_charset_length, char *html_charset, int html_match_length, char *html_lowercase_match, char *html_uppercase_match); +int locateInWindow(char *window, char *birdLower, char *birdUpper, int length); +int locateInURL(char *url, char *birdLower, char *birdUpper, int length, int urlSize); +int canCrawl(int urlSize, char *urltocheck); +void shuffleURLs(int iterations, long urlListSize); +void sqlsafe(); +void charset2mysql(); +void filtervars(); + +FILE *f; +char *fileStr; +char c; + +void htmlparse(){ + long urlListSize=0; + numURL=0; + int intag=0,incomment=0,inscript=0,instyle=0,inlink=0,putspace=0,spacecount=0,foundbr=0; + int urlSize=0,dqcount=0; + titlefound=charsetfound=descriptionfound=keywordsfound=robotsfound=nofollow=noindex=scriptfound=stylefound=num_stylesheets=num_scripts=0; + charsetsize=titlesize=keywordssize=descriptionsize=robotssize=bodysize=0; + + memset(window,'#',window_len); +// window[window_len]=0; + memset(windowWithSpaces,'#',window_len); +// windowWithSpaces[window_len]=0; + memset(charset,0,charset_len+1); + memset(mysqlcharset,0,mysqlcharset_len+1); + memset(title,0,title_len+1); + memset(keywords,0,keywords_len+1); + 
memset(description,0,description_len+1); + memset(robots,0,robots_len+1); + memset(body,0,body_len+1); + memset(urlList,0,urlList_len+1); + memset(strURL,0,strURL_len+1); + memset(urlListShuffled,0,urlList_len+1); + memset(urlListHoldShuffled,0,urlList_len+1); + memset(title_filtered,0,title_len+1); + memset(body_filtered,0,body_len+1); + memset(description_filtered,0,description_len+1); + printf("Parsing HTML... "); + + //open html file and load into memory + f = fopen(filename, "rb"); + fseek(f, 0, SEEK_END); + long fsize = ftell(f); + fseek(f, 0, SEEK_SET); /* same as rewind(f); */ + + fileStr = malloc(fsize + 1); + if(fread(fileStr, 1, fsize, f)){}; + fclose(f); + + fileStr[fsize] = 0; + + //Locate the charset, title, description, keywords, robots, body + //must accomodate human error in markup + //must double all single quotes for mysql safety + //dont allow extra whitespace, ignore cr/lf/tabs + //complete it all in one pass + + for(int i=0;i= (title_len-2)) + titlefound=3; + } + if(locateInWindow(window,"","",8)==1 && titlefound!=3){ + titlefound = 3; + //remove from end of title by inserting null at location of < + titlesize -= 8; + if(titlesize < 0){ //avoids this: + titlesize = 0; + emptytitle = 1; + } + title[titlesize] = 0; + //printf("\n%s",title); + } + } + if(titlefound == 1 && c=='>')//in case of this situation: + titlefound = 2; + if(titlefound == 0 && locateInWindow(window,"<title","<TITLE",6)==1){ + titlefound = 1; + } + + //Get Charset + if(charsetfound == 1){ + if(c == '>' || c == '/'){ + charsetfound = 2; + //printf("\n%s",charset); + } + if(charsetfound == 1 && charsetsize < charset_len && c != '"' && c != '\'' && skipchar == 0){ + charset[charsetsize]=c; + charsetsize++; + } + } + if(charsetfound == 0 && locateInWindow(window,"charset=","CHARSET=",8)==1){ + charsetfound = 1; + } + + //Get Description + if(descriptionfound == 1){ + if(c == '>' || c == '/'){ + descriptionfound = 2; + //printf("\n%s",description); + } + if(descriptionfound == 1 
&& descriptionsize < (description_len-2) && c != '"' && skipchar == 0){ + description[descriptionsize]=c; + descriptionsize++; + if(c == 39){//check for single quotes and double them up for sql safety + description[descriptionsize]=c; + descriptionsize++; + } + } + } + if(descriptionfound == 0 && locateInWindow(window,"description\"content=","DESCRIPTION\"CONTENT=",20)==1){ + descriptionfound = 1; + } + + //Get Keywords + if(keywordsfound == 1){ + if(c == '>' || c == '/'){ + keywordsfound = 2; + //printf("\n%s",keywords); + } + if(keywordsfound == 1 && keywordssize < (keywords_len-2) && c != '"' && skipchar == 0){ + keywords[keywordssize]=c; + keywordssize++; + if(c == 39){//check for single quotes and double them up for sql safety + keywords[keywordssize]=c; + keywordssize++; + } + } + } + if(keywordsfound == 0 && locateInWindow(window,"keywords\"content=","KEYWORDS\"CONTENT=",17)==1){ + keywordsfound = 1; + } + + //Get Robots (nofollow, noindex) + if(robotsfound == 1){ + if(c == '>' || c == '/'){ + robotsfound = 2; + //printf("\n%s",robots); + if(locateInWindow(window,"nofollow","NOFOLLOW",8)==1) + nofollow=1; + if(locateInWindow(window,"noindex","NOINDEX",7)==1 || locateInWindow(window,"none","NONE",4)==1) + noindex=nofollow=1; + } + if(robotsfound == 1 && robotssize < robots_len && c != '"' && c != '\'' && skipchar == 0){ + robots[robotssize]=c; + robotssize++; + } + } + if(robotsfound == 0 && locateInWindow(window,"robots\"content=","ROBOTS\"CONTENT=",15)==1){ + robotsfound = 1; + } + + if(titlefound != 2){ + //Ignore between scripts, styles, and remove all tags, repeated spaces, tabs, cr, lf, null, add a space at end of every tag + if(c=='<'){ + intag = 1; + }else if(c=='>'){ + intag = 0; + putspace = 1; + } + + if(locateInWindow(window,"<!--","<!--",4)==1){ + incomment = 1; + }else if(locateInWindow(window,"-->","-->",3)==1){ + incomment = 0; + } + + if(locateInWindow(window,"<script","<SCRIPT",7)==1 && c != ' ' && skipchar == 0){ + inscript = 1; + 
num_scripts++; + }else if(locateInWindow(window,"</script>","</SCRIPT>",9)==1){ + inscript = 0; + } + + if(locateInWindow(window,"<style","<STYLE",6)==1 && c != ' ' && skipchar == 0){ + instyle = 1; + num_stylesheets++; + }else if(locateInWindow(window,"</style>","</STYLE>",8)==1){ + instyle = 0; + } + + if(locateInWindow(window,"<link","<LINK",5)==1){ + inlink = 1; + }else if(inlink==1 && locateInWindow(window,">",">",1)==1){ + inlink = 0; + } + if(inlink==1){ + if(locateInWindow(window,".css",".CSS",4)==1 && c != ' ' && skipchar == 0) + num_stylesheets++; + } + + //Get Body + //exclude remaining tags, comments, scripts, styles, cr, lf, null, tab, add a space after a '>' but only allow one + if(intag == 0 && incomment == 0 && inscript == 0 && instyle == 0 && inlink == 0 && skipchar == 0 && bodysize < (body_len-2)){ + if(putspace == 1){ + if(spacecount == 0){ + body[bodysize]=32; + bodysize++; + } + spacecount++; + putspace=0; + }else{ + if(c==32) + spacecount++; + else spacecount = 0; + + if(spacecount < 2){ + body[bodysize]=c; + bodysize++; + + if(c == 39){//check for single quotes and double them up for sql safety + body[bodysize]=c; + bodysize++; + } + } + } + } + } + + //Get URL's + if(getURLs==1){ + if(urlFound == 1 && incomment==0 && instyle==0 && inscript==0 && inlink == 0){ + if(c=='"' || c=='\'') + dqcount++; + if((c == '#' && urlSize==0) || (dqcount == 2 && urlSize == 0) || (c == ' ' && urlSize == 0)) + urlFound=urlTagFound=dqcount=0; + if((c == '>' || c == ' ') && urlFound == 1){ + if(canCrawl(urlSize,strURL)==0 || (urlSize+urlListSize) >= (urlList_len-1)){ + memset(strURL,0,strURL_len+1); + }else{ + strcat(urlList,strURL); + strcat(urlList,"\n"); + urlListSize+=urlSize+1; + memset(strURL,0,strURL_len+1); + numURL++; + } + urlFound = urlTagFound = urlSize = dqcount = 0; + } + if(urlFound == 1 && urlListSize < (urlList_len-2) && c != '"' && c != '\'' && urlSize < (strURL_len-2)){ + strURL[urlSize]=window[window_len-1]; + urlSize++; + } + if(urlSize==11){ 
+ if(locateInWindow(window,"javascript:","JAVASCRIPT:",11)==1){ + urlFound=urlTagFound=urlSize=dqcount=0; + memset(strURL,0,strURL_len+1); + } + } + } + if(urlFound == 0 && urlTagFound == 0 && incomment == 0 && instyle == 0 && inscript == 0 && inlink == 0 && locateInWindow(windowWithSpaces,"<a ","<A ",3)==1){//sometimes there is something between "<a" and "href" + urlTagFound = 1; + } + if(urlFound == 0 && incomment == 0 && instyle == 0 && inscript == 0 && inlink == 0 && (locateInWindow(window,"ahref=","AHREF=",6)==1 || (urlTagFound == 1 && locateInWindow(window,"href=","HREF=",5)==1))){ + urlFound = 1; + } + } + } + + //Convert charset to mysql equivalent + charset2mysql(); + + //Filter additional characters *if* required + filtervars(); + + //print body to file +/* bodyfile = fopen("body.txt","wb"); + fputs(body,bodyfile); + fclose(bodyfile); + + //print title to file + titlefile = fopen("title.txt","wb"); + fputs(title,titlefile); + fclose(titlefile); + + //print keywords to file + keywordsfile = fopen("keywords.txt","wb"); + fputs(keywords,keywordsfile); + fclose(keywordsfile); + + //print description to file + descriptionfile = fopen("description.txt","wb"); + fputs(description,descriptionfile); + fclose(descriptionfile); + + //print charset to file + charsetfile = fopen("charset.txt","wb"); + fputs(mysqlcharset,charsetfile); + fclose(charsetfile); + + //print noindex to file + noindexfile = fopen("noindex.txt","wb"); + if(noindex==1) + fputs("noindex",noindexfile); + fclose(noindexfile); + + //print nofollow to file + nofollowfile = fopen("nofollow.txt","wb"); + if(nofollow==1) + fputs("nofollow",nofollowfile); + fclose(nofollowfile);*/ + + if(getURLs==1){ + //shuffle order of collected URLs list + shuffleURLs(10,urlListSize); + //printf("\n%s",urlList); + + /*//print URLs to file + urlfile = fopen("url.txt","wb"); + fputs(urlList,urlfile); + fclose(urlfile); + + //print shuffled URLs to file + shuffledurlfile = fopen("urlshuffled.txt","wb"); + 
fputs(urlListShuffled,shuffledurlfile);
+        fclose(shuffledurlfile); */
+    }
+
+    free(fileStr);
+
+    printf("\nbody: %ld, title: %ld, charset: %ld, description: %ld, keywords: %ld, noindex: %d, nofollow: %d",bodysize,titlesize,charsetsize,descriptionsize,keywordssize,noindex,nofollow);
+}
+
+/*
+ * Shuffles the newline-separated URL list in urlList into urlListShuffled.
+ *
+ * iterations  - number of random rotations to attempt.
+ * urlListSize - number of bytes of urlList in use (URL text plus one '\n' per URL).
+ *
+ * Each iteration picks two URL ordinals r1 < r2 (1-based) and rewrites the list as
+ * [URLs r1+1..r2][URLs 1..r1][URLs after r2]. Iterations where r1 == r2 are skipped.
+ * The random generator is seeded once per process via the file-level 'seeded' flag.
+ */
+void shuffleURLs(int iterations, long urlListSize)
+{
+    if(seeded==0){
+        srand(time(NULL));
+        seeded=1;
+    }
+
+    //Fix: always start from the unshuffled order. Previously, when numURL > 2 and every
+    //iteration drew r1 == r2 (or iterations <= 0), urlListShuffled was never written and
+    //stayed empty, so no collected link was queued for crawling.
+    strcpy(urlListShuffled,urlList);
+
+    //lists of one or two URLs are left in their original order
+    if(numURL<=2){
+        return;
+    }
+
+    strcpy(urlListHoldShuffled,urlList);
+    for(int loops=0;loops<iterations;loops++){
+        int r1 = (rand() % numURL) + 1;
+        int r2 = (rand() % numURL) + 1;
+
+        if(r1==r2){
+            continue;
+        }
+        if(r1>r2){
+            int swap = r1;
+            r1 = r2;
+            r2 = swap;
+        }
+
+        int urlCount=0;
+        int src=0;
+        int dst=0;
+
+        //skip past the first r1 URLs; src ends on the first byte of URL r1+1
+        while(urlCount < r1){
+            if(urlListHoldShuffled[src]=='\n')
+                urlCount++;
+            src++;
+        }
+        int headLen=src;
+
+        //move URLs r1+1..r2 to the front of the shuffled list
+        while(urlCount < r2){
+            urlListShuffled[dst]=urlListHoldShuffled[src];
+            if(urlListHoldShuffled[src]=='\n')
+                urlCount++;
+            src++;
+            dst++;
+        }
+
+        //append the URLs that used to precede them (URLs 1..r1)
+        for(int head=0;head<headLen;head++){
+            urlListShuffled[dst]=urlListHoldShuffled[head];
+            dst++;
+        }
+
+        //everything after URL r2 keeps its position
+        while(dst<urlListSize){
+            urlListShuffled[dst]=urlListHoldShuffled[dst];
+            dst++;
+        }
+
+        //the next rotation works on the result of this one
+        strcpy(urlListHoldShuffled,urlListShuffled);
+    }
+}
+
+/*
+ * Translates the charset captured from the page (charset / charsetsize) into a
+ * "SET CHARSET ...;" statement stored in mysqlcharset, and prints the statement.
+ * Falls back to utf8 when no charset was captured or none of the known names match.
+ */
+void charset2mysql()
+{
+    //if no charset specified, use utf8
+    if(charsetsize == 0){
+        strcpy(mysqlcharset,"SET CHARSET utf8;");
+        printf("No Charset found. %s",mysqlcharset);
+    }
+    else{ //else, match charset with a proper mysql charset
+
+        if(matchMySQLcharset(charsetsize,charset,5,"utf-8","UTF-8")==1){
+            strcpy(mysqlcharset,"SET CHARSET utf8mb4;");
+            printf("%s",mysqlcharset);
+        }
+        else if(matchMySQLcharset(charsetsize,charset,6,"latin1","LATIN1")==1){
+            strcpy(mysqlcharset,"SET CHARSET latin1;");
+            printf("%s",mysqlcharset);
+        }
+        else if(matchMySQLcharset(charsetsize,charset,9,"shift-jis","SHIFT-JIS")==1){
+            strcpy(mysqlcharset,"SET CHARSET cp932;");
+            printf("%s",mysqlcharset);
+        }
+        else if(matchMySQLcharset(charsetsize,charset,6,"x-sjis","X-SJIS")==1){
+            strcpy(mysqlcharset,"SET CHARSET cp932;");
+            printf("%s",mysqlcharset);
+        }
+        else if(matchMySQLcharset(charsetsize,charset,10,"iso-8859-1","ISO-8859-1")==1){
+            strcpy(mysqlcharset,"SET CHARSET latin1;");
+            printf("%s",mysqlcharset);
+        }
+        else if(matchMySQLcharset(charsetsize,charset,12,"windows-1252","WINDOWS-1252")==1){
+            strcpy(mysqlcharset,"SET CHARSET latin1;");
+            printf("%s",mysqlcharset);
+        }
+        else if(matchMySQLcharset(charsetsize,charset,12,"windows-1251","WINDOWS-1251")==1){
+            strcpy(mysqlcharset,"SET CHARSET cp1251;");
+            printf("%s",mysqlcharset);
+        }
+        else if(matchMySQLcharset(charsetsize,charset,12,"windows-1250","WINDOWS-1250")==1){
+            strcpy(mysqlcharset,"SET CHARSET cp1250;");
+            printf("%s",mysqlcharset);
+        }
+        else if(matchMySQLcharset(charsetsize,charset,6,"koi8-r","KOI8-R")==1){
+            //Fix: KOI8-R is not byte-compatible with cp1251; MySQL's name for it is koi8r
+            strcpy(mysqlcharset,"SET CHARSET koi8r;");
+            printf("%s",mysqlcharset);
+        }
+        else if(matchMySQLcharset(charsetsize,charset,6,"euc-kr","EUC-KR")==1){
+            strcpy(mysqlcharset,"SET CHARSET euckr;");
+            printf("%s",mysqlcharset);
+        }
+        else if(matchMySQLcharset(charsetsize,charset,4,"big5","BIG5")==1){
+            strcpy(mysqlcharset,"SET CHARSET big5;");
+            printf("%s",mysqlcharset);
+        }
+        else{
+            strcpy(mysqlcharset,"SET CHARSET utf8;");
+            printf("Charset mismatch. 
%s",mysqlcharset);
+        }
+    }
+}
+
+/*
+ * Prefix-compares a charset name taken from the page against one known name.
+ *
+ * html_charset_length  - number of characters captured in html_charset.
+ * html_charset         - charset name as found in the page.
+ * html_match_length    - length of the name to match.
+ * html_lowercase_match - the name in lower case.
+ * html_uppercase_match - the same name in upper case.
+ *
+ * Returns 1 when the first html_match_length characters of html_charset equal the
+ * name (each position may be in either case), 0 otherwise. A '_' or '-' on either
+ * side matches any character at that position.
+ */
+int matchMySQLcharset(int html_charset_length, char *html_charset, int html_match_length, char *html_lowercase_match, char *html_uppercase_match)
+{
+    if(html_charset == 0 || html_lowercase_match == 0 || html_uppercase_match == 0 || html_match_length <= 0){
+        return 0;
+    }
+    for(int i=0;i<html_match_length;i++){
+        //Fix: was "i > html_charset_length", which allowed reading html_charset[length],
+        //one element past the captured characters
+        if(i >= html_charset_length){
+            return 0;
+        }
+        char have = html_charset[i];
+        char want = html_lowercase_match[i];
+        if(have == '_' || have == '-' || want == '_' || want == '-'){
+            continue; //separator on either side: position is not compared
+        }
+        if(have != want && have != html_uppercase_match[i]){
+            return 0;
+        }
+    }
+    return 1;
+}
+
+/*
+ * Tests whether the last 'length' characters of the window_len-sized buffer 'window'
+ * spell birdLower or birdUpper (each position may match either spelling).
+ * Returns 1 on a match, 0 otherwise.
+ */
+int locateInWindow(char *window, char *birdLower, char *birdUpper, int length)
+{
+    //a pattern longer than the window would give a negative start index
+    if(length < 0 || length > window_len){
+        return 0;
+    }
+    char *tail = window + (window_len - length);
+    for(int i=0;i<length;i++){
+        if(tail[i] != birdLower[i] && tail[i] != birdUpper[i]){
+            return 0;
+        }
+    }
+    return 1;
+}
+
+/*
+ * Tests whether the url of urlSize characters ends with birdLower or birdUpper
+ * (each position may match either spelling). Returns 1 on a match, 0 otherwise,
+ * including when the url is shorter than the pattern.
+ */
+int locateInURL(char *url, char *birdLower, char *birdUpper, int length, int urlSize)
+{
+    if(urlSize < length){
+        return 0;
+    }
+    char *tail = url + (urlSize - length);
+    for(int i=0;i<length;i++){
+        if(tail[i] != birdLower[i] && tail[i] != birdUpper[i]){
+            return 0;
+        }
+    }
+    return 1;
+}
+
+//Returns 1 when the first six characters look like "http:/" or "https:" in any letter case.
+//The caller must make sure at least six characters are readable.
+static int urlHasHttpPrefix(char *u)
+{
+    return (u[0]=='h' || u[0]=='H')
+        && (u[1]=='t' || u[1]=='T')
+        && (u[2]=='t' || u[2]=='T')
+        && (u[3]=='p' || u[3]=='P')
+        && (u[4]=='s' || u[4]=='S' || u[4]==':')
+        && (u[5]==':' || u[5]=='/');
+}
+
+//Returns 1 when the url ends with one of the file extensions the crawler accepts.
+static int urlHasAllowedExtension(char *url, int urlSize)
+{
+    static char *lower[] = {".html",".htm",".txt",".php",".asp",".xhtml",".shtml"};
+    static char *upper[] = {".HTML",".HTM",".TXT",".PHP",".ASP",".XHTML",".SHTML"};
+    static const int lens[] = {5,4,4,4,4,6,6};
+
+    for(int e=0;e<7;e++){
+        if(locateInURL(url,lower[e],upper[e],lens[e],urlSize)==1){
+            return 1;
+        }
+    }
+    return 0;
+}
+
+/*
+ * Decides whether a collected link may be added to the crawl queue.
+ * Removing this check would queue everything listed, including external links.
+ *
+ * Returns 0 when the url
+ *   - contains '?', '\', '"', '\'' or a space,
+ *   - has a ':' after index 3 (and is longer than 5 characters) but does not start
+ *     with an http/https prefix,
+ *   - ends in a file name whose extension is not one of
+ *     .html .htm .txt .php .asp .xhtml .shtml.
+ * Returns 1 otherwise (no dots at all, no trailing file extension, or an allowed one).
+ * Dots seen while a prefixed url has at most two slashes (the host part) are not
+ * treated as a file extension.
+ */
+int canCrawl(int urlSize, char *urltocheck){
+    int numDots=0,numSlash=0;
+    int extfound=0,prefixfound=0;
+
+    for(int i=0;i<urlSize;i++){
+        char ch = urltocheck[i];
+
+        if(ch==':' && i>3 && urlSize>5){
+            if(urlHasHttpPrefix(urltocheck)==0)
+                return 0;
+            prefixfound=1;
+        }
+        if(ch=='?' || ch=='\\' || ch=='"' || ch=='\'' || ch==' '){
+            return 0;
+        }
+        if(ch=='.'){
+            numDots++;
+            extfound=1; //candidate extension starts here
+        }else if(ch=='/'){
+            numSlash++;
+            extfound=0; //a later path segment cancels the candidate
+        }
+        if(prefixfound==1 && numSlash<=2){
+            extfound=0; //still inside scheme/host of a prefixed url
+        }
+    }
+    if(numDots == 0){
+        return 1;
+    }
+    if(extfound==0){
+        return 1;
+    }
+
+    //restrict file extensions to the accepted list
+    return urlHasAllowedExtension(urltocheck,urlSize);
+}
+
+void filtervars(){
+    //Creates a copy of title, description, body variables with single-quotes filtered out
+    //will be used for the shard tables, but not on the primary 'windex' table
+    //allows a more restrictive query to be used. 
Is agnostic to searches containing single-quotes as a compromise + + //filter title + int j=0; + for(int i=0;i<titlesize;i++){ + if(title[i]!=39){ + title_filtered[j]=title[i]; + j++; + } + } + + //filter description + j=0; + for(int i=0;i<descriptionsize;i++){ + if(description[i]!=39){ + description_filtered[j]=description[i]; + j++; + } + } + + //filter body + j=0; + for(int i=0;i<bodysize;i++){ + if(body[i]!=39){ + body_filtered[j]=body[i]; + j++; + } + } +} diff --git a/c/page.out b/c/page.out new file mode 100644 index 0000000..6bbb492 --- /dev/null +++ b/c/page.out @@ -0,0 +1,268 @@ +<!DOCTYPE html> +<html> + <head> + <meta charset="UTF-8"> + <meta name="viewport" content="width=device-width, initial-scale=1.0"> + <style> + body { + background-image: url(spacepixel.gif); + background-size: cover; + margin: 0; + padding: 0; + } + +</style> + <link href="/style.css" rel="stylesheet" type="text/css" media="all"> + <style> +img { + display: block; + margin: 0 auto; +} +</style> +<style> +.gradient-text { + background-image: linear-gradient(to bottom, #ff5e62, #ff9966, #ffa366, #ffcc99, #a7c5eb, #6d7fcc); + -webkit-background-clip: text; + -webkit-text-fill-color: transparent; +} + +.updates-box { + background-color: #000; + color: #BF40BF; + font-family: 'Comic Sans MS', Comic, cursive; + font-size: 1.2em; + line-height: 1.5; + padding: 20px; + border: 3px solid #BF40BF; + border-radius: 10px; +} + +.hey { + *, +*::before, +*::after { + box-sizing: border-box; +} + +:root { + --color-primary: #f6aca2; + --color-secondary: #f49b90; + --color-tertiary: #f28b7d; + --color-quaternary: #f07a6a; + --color-quinary: #ee6352; + /* + --color-primary: #5192ED; + --color-secondary: #69A1F0; + --color-tertiary: #7EAEF2; + --color-quaternary: #90BAF5; + --color-quinary: #A2C4F5; + */ +} + +body { + min-height: 100vh; + font-family: canada-type-gibson, sans-serif; + font-weight: 300; + font-size: 1.25rem; + display: flex; + flex-direction: column; + justify-content: center; + 
overflow: hidden; +} + +.content { + display: flex; + align-content: center; + justify-content: center; +} + +.text_shadows { + text-shadow: 3px 3px 0 var(--color-secondary), 6px 6px 0 var(--color-tertiary), + 9px 9px var(--color-quaternary), 12px 12px 0 var(--color-quinary); + font-family: bungee, sans-serif; + font-weight: 400; + text-transform: uppercase; + font-size: calc(2rem + 5vw); + text-align: center; + margin: 0; + color: var(--color-primary); + //color: transparent; + //background-color: white; + //background-clip: text; + animation: shadows 1.2s ease-in infinite, move 1.2s ease-in infinite; + letter-spacing: 0.4rem; +} + +@keyframes shadows { + 0% { + text-shadow: none; + } + 10% { + text-shadow: 3px 3px 0 var(--color-secondary); + } + 20% { + text-shadow: 3px 3px 0 var(--color-secondary), + 6px 6px 0 var(--color-tertiary); + } + 30% { + text-shadow: 3px 3px 0 var(--color-secondary), + 6px 6px 0 var(--color-tertiary), 9px 9px var(--color-quaternary); + } + 40% { + text-shadow: 3px 3px 0 var(--color-secondary), + 6px 6px 0 var(--color-tertiary), 9px 9px var(--color-quaternary), + 12px 12px 0 var(--color-quinary); + } + 50% { + text-shadow: 3px 3px 0 var(--color-secondary), + 6px 6px 0 var(--color-tertiary), 9px 9px var(--color-quaternary), + 12px 12px 0 var(--color-quinary); + } + 60% { + text-shadow: 3px 3px 0 var(--color-secondary), + 6px 6px 0 var(--color-tertiary), 9px 9px var(--color-quaternary), + 12px 12px 0 var(--color-quinary); + } + 70% { + text-shadow: 3px 3px 0 var(--color-secondary), + 6px 6px 0 var(--color-tertiary), 9px 9px var(--color-quaternary); + } + 80% { + text-shadow: 3px 3px 0 var(--color-secondary), + 6px 6px 0 var(--color-tertiary); + } + 90% { + text-shadow: 3px 3px 0 var(--color-secondary); + } + 100% { + text-shadow: none; + } +} + +@keyframes move { + 0% { + transform: translate(0px, 0px); + } + 40% { + transform: translate(-12px, -12px); + } + 50% { + transform: translate(-12px, -12px); + } + 60% { + transform: 
translate(-12px, -12px); + } + 100% { + transform: translate(0px, 0px); + } +} + +} + +.gradient-text span { + animation: gradient 15s ease-in-out infinite; +} + +@keyframes gradient { + 0% { + background-position: 0% 0%; + } + 50% { + background-position: 100% 0%; + } + 100% { + background-position: 0% 0%; + } +} + .gif-container { + display: flex; + justify-content: center; + align-items: center; + flex-wrap: wrap; + } + + .gif-container img { + max-width: 100%; + margin: 0 10px; + } + + a { + text-decoration: none; + color: #fff; + display: block; + margin: 20px 0; + text-align: center; + font-size: 1.5rem; + } + + img { + display: block; + margin: 0 auto; + max-width: 100%; + } + </style> + </head> + <body> + </body> +</html> + </head> + <body> + <img src="mew's corner.gif"> + <div class="content"></h2> + </div> + <h1 class="gradient-text">Welcome to Mew's corner!。゚(₌இᆽஇ₌)♡I'm a quantitative analyst, artist, philosopher, world record holder in some videogames, most profilific fanfiction writer of multiple niche fandoms, and a human that's just doing her best to do her best, which often leaves a lot to be desired. They say spirituality isn't about being perfect, but recognizing our imperfections so we can continually become more. I like that a lot. 
(₌ ᵕ̣̣̣̣̣ ᆽ ̣̣̣̣̣̣̣ᵕ) + <br><br> + + <a href="https://mewscorner.neocities.org/coding_adventures"> + <h2 class="gradient-text">༒Coding adventures༒</h2> + <img src="linux.gif"> + </a> + + <a href="https://mewscorner.neocities.org/fanfiction"> + <h2 class="gradient-text">༒Fanfiction༒</h2> + <img src="book.gif"> + </a> + + <a href="https://mewscorner.neocities.org/philosophy"> + <h2 class="gradient-text">༒Philosophy༒</h2> + <img src="philosopher.gif"> + </a> + + <a href="https://mewscorner.neocities.org/gay"> + <h2 class="gradient-text">༒Super Secret Page༒</h2> + <img src="sailor_moon_cry.gif"> + </a> + + <h1 class="updates-box"><br>UPDATES: <br><br> + <a href="https://mewscorner.neocities.org/yh" target="_blank"><img src="new.gif"> + </h1> + + <h1 class="updates-box">update 1.2 2025年4月2日 【木蛇】<br><br> + Added some older pages that had been removed<br> + <br><br> + <a href="https://mewscorner.neocities.org/coding_adventures" target="_blank"><img src="new.gif"> + </h1> + + <h1 class="updates-box">update 1.2 2024年11月15日 【木龍】<br><br> + Update boxes added.<br>Added Philosophy Portfolio. <br> Added code portfolio. 
+ <br><br> + <a href="https://mewscorner.neocities.org/coding_adventures" target="_blank"><img src="new.gif"> + </h1> + + + + <div class="gif-container"> + <img src="torch.gif"> + <img style="margin-right: -15px" src="sword.gif"> + <img src="crystal.gif"> + <img style="margin-left: -15px; transform: rotateY(180deg)" src="sword.gif"> + <img src="torch.gif"> +</div> + + + + <img src="dragons.gif"> + <img src="Under_Construction_Bar.gif"> + + </body> +</html> diff --git a/c/rs b/c/rs new file mode 100755 index 0000000..0cfc7cf Binary files /dev/null and b/c/rs differ diff --git a/c/rs.c b/c/rs.c new file mode 100755 index 0000000..0134b8f --- /dev/null +++ b/c/rs.c @@ -0,0 +1,224 @@ +//wiby refresh scheduler + +#include </usr/include/mysql/mysql.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <ctype.h> + +void finish_with_error(MYSQL *con) +{ + fprintf(stderr, "%s\n", mysql_error(con)); + mysql_close(con); + exit(1); +} +void help(){ + printf("\nWiby Refresh Scheduler\n\nUsage: rs Batch_Limit Total_Crawlers\n\nThe refresh scheduler finds pages that need to be refreshed and adds them to the indexqueue to be crawled. It will wait for the batch to complete before adding more.\n\nThere are two arguments you can set, the max number of pages to grab for each batch, and the total number of crawlers available.\n\nIf you set no arguments, it assumes you have one crawler running with an unassigned ID or an ID of 1, and will set a limit of ten pages per batch, rechecking if it finishes every 5 seconds.\n\nIf you have two crawlers running and a batch limit of 100 pages, this is how you would run the scheduler:\n\n./rs 100 2\n\nIn that example, each crawler will be assigned 50 pages. 
Once all 100 have been crawled, another batch will be assigned.\n\nYou can also specify only a batch limit and omit the total number of crawlers, it will then assume one crawler with an unassigned ID or ID of 1 by default.\n\nThe program will sleep for 60 seconds if there are no stale pages found.\n\nIf you notice pages are not being updated when expected, you may have to increase the batch limit or add another crawler.\n\n"); + exit(0); +} +int isnum(char *source){ + int sourcelength = strlen(source); + for(int i=0;i < sourcelength; i++){ + if(source[i] < 48 || source[i] > 57){ + return 0; + } + } + return 1; +} + +int main(int argc, char **argv) +{ + int wait_batch = 0,n_lim=10,num_cr=0,cr_count=1; + char lim[100] = "10"; + + if(argc == 3 && isnum(argv[2])==1 && isnum(argv[1])==1){ + num_cr = atoi(argv[2]); + n_lim = atoi(argv[1]); + }else if(argc == 2 && isnum(argv[1])==1){ + n_lim = atoi(argv[1]); + }else if(argc > 1){ + help(); + } + if(n_lim > 0 && argc > 1){ + strcpy(lim,argv[1]); + } + + while(1) + { + //allocates or initialises a MYSQL object + MYSQL *con = mysql_init(NULL); + + if (con == NULL) + { + finish_with_error(con); + } + + //establish a connection to the database. We provide connection handler, host name, user name and password parameters to the function. The other four parameters are the database name, port number, unix socket and finally the client flag + if (mysql_real_connect(con, "localhost", "crawler", "seekout", NULL, 0, NULL, 0) == NULL) + { + finish_with_error(con); + } + + if (mysql_query(con, "use wiby")) + { + finish_with_error(con); + } + + //check if indexqueue has rows from a previous batch sent by the scheduler (should not insert more until it's empty) + if (mysql_query(con, "SELECT id FROM indexqueue WHERE task = 1")) + { + finish_with_error(con); + } + + //We get the result set using the mysql_store_result() function. 
The MYSQL_RES is a structure for holding a result set + MYSQL_RES *result = mysql_store_result(con); + + if(result == NULL) + { + finish_with_error(con); + } + + int num_rows = 0; + int re_rows = mysql_num_rows(result); + mysql_free_result(result); + + if(re_rows > 0){ + mysql_close(con); + if(wait_batch == 0){ + printf("\nWaiting for batch to complete...\n\n"); + } + wait_batch = 1; + }else{ + wait_batch = 0; + char querywindex[1000]; + memset(querywindex,0,1000); + strcpy(querywindex,"SELECT id,url,worksafe,approver,surprise,updatable,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules FROM windex WHERE (CASE WHEN updatable = 1 THEN updated < NOW() - INTERVAL 1 WEEK WHEN updatable = 2 THEN updated < NOW() - INTERVAL 1 DAY WHEN updatable = 3 THEN updated < NOW() - INTERVAL 12 HOUR WHEN updatable = 4 THEN updated < NOW() - INTERVAL 6 HOUR WHEN updatable = 5 THEN updated < NOW() - INTERVAL 3 HOUR WHEN updatable = 6 THEN updated < NOW() - INTERVAL 1 HOUR END) AND updatable != 0 AND enable = 1 LIMIT "); + strcat(querywindex,lim); + strcat(querywindex,";"); + //printf("\n%s",querywindex); + + //Get aging windex entries + if (mysql_query(con,querywindex)) + { + finish_with_error(con); + } + + result = mysql_store_result(con); + + if(result == NULL) + { + finish_with_error(con); + } + + //get the number of fields (columns) in the table + //int num_fields = mysql_num_fields(result); + num_rows = mysql_num_rows(result); + + MYSQL_ROW row; + + while(row = mysql_fetch_row(result)){ + printf("----------------------------------------------------------\nRefresh:"); + + //Store data in first row into variables + char *id = row[0]; + char *url = row[1]; + char *worksafe = row[2]; + char *approver = row[3]; + char *surprise = row[4]; + char *updatable = row[5]; + char *crawl_tree = row[6]; + char *crawl_family = row[7]; + char *crawl_pages = row[8]; + char *crawl_type = row[9]; + char *crawl_repeat = row[10]; + char *force_rules = row[11]; + + char 
str_cr_count[100]; + memset(str_cr_count,0,100); + sprintf(str_cr_count,"%d",cr_count); + + printf("\nURL: %s\nID: %s\nWorksafe: %s\nSurprise: %s\nApprover: %s\nUpdatable: %s", url, id, worksafe, surprise, approver, updatable); + if(num_cr > 0){ + printf("\nCrawler ID: %d",cr_count); + }else{ + printf("\nCrawler ID: 1"); + } + + char sqlqueryinsertindexqueue[2000]; + memset(sqlqueryinsertindexqueue,0,2000); + strcpy(sqlqueryinsertindexqueue,"INSERT INTO indexqueue (url,worksafe,approver,surprise,updatable,crawl_tree,crawl_family,crawl_pages,crawl_type,crawl_repeat,force_rules,task,crawler_id) VALUES ('"); + strcat(sqlqueryinsertindexqueue,url);strcat(sqlqueryinsertindexqueue,"','"); + strcat(sqlqueryinsertindexqueue,worksafe);strcat(sqlqueryinsertindexqueue,"','"); + strcat(sqlqueryinsertindexqueue,approver);strcat(sqlqueryinsertindexqueue,"','"); + strcat(sqlqueryinsertindexqueue,surprise);strcat(sqlqueryinsertindexqueue,"','"); + strcat(sqlqueryinsertindexqueue,updatable);strcat(sqlqueryinsertindexqueue,"',"); + if(crawl_tree != NULL){ + strcat(sqlqueryinsertindexqueue,"'");strcat(sqlqueryinsertindexqueue,crawl_tree);strcat(sqlqueryinsertindexqueue,"',"); + }else{ + strcat(sqlqueryinsertindexqueue,"NULL");strcat(sqlqueryinsertindexqueue,","); + } + if(crawl_family != NULL){ + strcat(sqlqueryinsertindexqueue,"'");strcat(sqlqueryinsertindexqueue,crawl_family);strcat(sqlqueryinsertindexqueue,"','"); + }else{ + strcat(sqlqueryinsertindexqueue,"NULL");strcat(sqlqueryinsertindexqueue,",'"); + } + if(crawl_pages != NULL){ + strcat(sqlqueryinsertindexqueue,crawl_pages);strcat(sqlqueryinsertindexqueue,"','"); + }else{ + strcat(sqlqueryinsertindexqueue,"0");strcat(sqlqueryinsertindexqueue,"','"); + } + if(crawl_type != NULL){ + strcat(sqlqueryinsertindexqueue,crawl_type);strcat(sqlqueryinsertindexqueue,"','"); + }else{ + strcat(sqlqueryinsertindexqueue,"0");strcat(sqlqueryinsertindexqueue,"','"); + } + if(crawl_repeat != NULL){ + 
strcat(sqlqueryinsertindexqueue,crawl_repeat);strcat(sqlqueryinsertindexqueue,"','"); + }else{ + strcat(sqlqueryinsertindexqueue,"0");strcat(sqlqueryinsertindexqueue,"','"); + } + if(force_rules != NULL){ + strcat(sqlqueryinsertindexqueue,force_rules);strcat(sqlqueryinsertindexqueue,"','1"); + }else{ + strcat(sqlqueryinsertindexqueue,"0");strcat(sqlqueryinsertindexqueue,"','1"); + } + if(num_cr > 0){ + strcat(sqlqueryinsertindexqueue,"','");strcat(sqlqueryinsertindexqueue,str_cr_count); + }else{ + strcat(sqlqueryinsertindexqueue,"','1"); + } + strcat(sqlqueryinsertindexqueue,"');"); + + printf("\nInserting into indexqueue...\n"); + if(mysql_query(con,sqlqueryinsertindexqueue)) + { + finish_with_error(con); + } + + //Assign to crawlers in round robin fashion if user indicated more than one crawler. + if(cr_count < num_cr && num_cr > 0){ + cr_count++; + }else if(num_cr > 0){ + cr_count=1; + } + } + + //cleanup sql stuff + mysql_free_result(result); + mysql_close(con); + + if(num_rows > 0){ + printf("\nAwaiting next set of pages...\n\n"); + } + } + + sleep(5);//sleep 5 seconds + + if(num_rows==0 && re_rows == 0)//sleep if no rows were found + sleep(60);//sleep 60 seconds + } + + exit(0); +} + diff --git a/c/rt b/c/rt new file mode 100755 index 0000000..deb29ae Binary files /dev/null and b/c/rt differ diff --git a/c/rt.c b/c/rt.c new file mode 100755 index 0000000..be46c2e --- /dev/null +++ b/c/rt.c @@ -0,0 +1,345 @@ +//Wiby replication server tracker +//Admin creates file 'servers.csv' containing only IP and shard name (or mock shard names if not using the sharding method), one per line +//When executing, include the expected number of search results per page (eg: ./rt 12) so that a +//divisible list of available servers is allocated to the core application by the tracker. 
+//Tracker will check status of replica databases by attempting to connect to all listed every few seconds +//Tracker will create a copy of this file called 'res.csv' and display only the confirmed online servers +//as well as ID ranges divided across all online servers (accounting for deleted rows) so each has the same number of rows. + +#include </usr/include/mysql/mysql.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <sys/time.h> + +FILE *servers; +FILE *error; +FILE *res; +int c,d; +char ip[1000][100]; +char db[1000][100]; +char ipOK[1000][100]; +char dbOK[1000][100]; +char startID[1000][100]; +char endID[1000][100]; +char firstOnlineServerIP[100]; +char firstOnlineServerDB[100]; +char *resfiletext; +char totalRows[50]; +char lastID[50]; +char strSQL[200]; + +struct timeval stop, start; + +void handle_error(MYSQL *con) +{ + error = fopen("rtlog", "a"); + printf("%s\n", mysql_error(con)); + fprintf(error, "%s\n", mysql_error(con)); + fclose(error); + mysql_close(con); +} +int isnum(char *source){ + int sourcelength = strlen(source); + for(int i=0;i < sourcelength; i++){ + if(source[i] < 48 || source[i] > 57){ + return 0; + } + } + return 1; +} + +int main(int argc, char **argv) +{ + int timetest=0,reportinit=0,running=0,pagelim=12; + if(argc == 2 && isnum(argv[1])==1){ + pagelim=atoi(argv[1]); + printf("\nStarting Replication Tracker:\n--------------------------------"); + }else{ + printf("\nStarting Replication Tracker:\n--------------------------------"); + printf("\n\nNo page limit was set, will use a default of 12.\n\nUsage: rt page_limit\n\nWhere page_limit is the expected number of search results per page."); + printf("\nMake sure you have setup servers.csv, see the install guide for more info."); + } + + printf("\n\nConnection Latency\n--------------------------------\n"); + + while(1) + { + long bytecount=0; + int serverCount=0, onlineServers=0, i=0, ipcnt=0, dbcnt=0, errcount=0, 
foundfirst=0,timeout=5,ignore = 0; + int ipORdb = 0; //0 = ip, 1 = space + servers = fopen("servers.csv", "rb"); + if (servers==NULL) + { + printf("Error opening 'servers.csv' file.\n"); + exit(0); + } + //parse server list + while((c = fgetc(servers)) != EOF) + { + if(c == 35)//check if line is commented out (#) + ignore = 1; + if(c != 10 && c != 13 && c != 32 && c != 44 && ipORdb == 0 && ignore == 0){//if no cr/lf, commas, spaces, or comments, gather ip + ip[serverCount][i] = c; + ipcnt++; + } + if(c==44 && ignore == 0){//if comma detected, switch to gather db name + ipORdb = 1; + i = -1; + } + if(c != 10 && c != 13 && c != 32 && c != 44 && ipORdb == 1 && ignore == 0){//if no cr/lf, commas, spaces, or comments, gather db + db[serverCount][i] = c; + dbcnt++; + } + if(c == 10){//count replication slaves + ipORdb = 0; + ip[serverCount][ipcnt] = 0;//null terminate string + db[serverCount][dbcnt] = 0; + if(ipcnt && dbcnt > 0) + serverCount++; + ipcnt = dbcnt = 0; + i = -1; + ignore = 0; + } + if(c != 13){ + i++; + bytecount++; + } + d=c; + } + if(i>0 && d != 10) + serverCount++; + fclose(servers); + + //Allocate bytes for the res file text +// resfiletext = (char*)calloc(bytecount+1000+(i*50),sizeof(char)); + char resfiletext[10000]; + memset(resfiletext,0,10000); + + //conect to each listed server and verify it works + int last=0; + for (i=0;i<serverCount;i++){ + int err = 0; + MYSQL *con = mysql_init(NULL); + if (con == NULL) + { + handle_error(con); + exit(0); + } + mysql_options(con,MYSQL_OPT_CONNECT_TIMEOUT,&timeout); + if(timetest==0){ + gettimeofday(&start, NULL); + } + if (mysql_real_connect(con, ip[i], "remote_guest", "d0gemuchw0w", "wiby", 0, NULL, 0) == NULL) + { + handle_error(con); + err=1; + } + if(timetest==0){ + gettimeofday(&stop, NULL); + printf("%s %s | %lums", ip[i], db[i], ((stop.tv_sec - start.tv_sec) * 1000000 + stop.tv_usec - start.tv_usec)/1000); + if(err==1) + printf(" (Fail)"); + printf("\n"); + } + if(err==0){//append successful connection 
info to res string + strcpy(ipOK[onlineServers],ip[i]); + strcpy(dbOK[onlineServers],db[i]); + last=i; + onlineServers++; + mysql_close(con); + } + } + timetest=1; + + //get more database info needed for distributed queries + //-------------------------------------------------------------------------------------------------------------------- + + //calculate how many servers can be used, which must be divisible to or by the search results per page limit. + int coreAssigned=onlineServers; + if(pagelim == onlineServers || onlineServers==0){ + //do nothing + }else if(pagelim>onlineServers){ + //compute number of servers to harness + while(pagelim % coreAssigned != 0){ + coreAssigned--; + } + }else if(pagelim<onlineServers){ + //compute number of servers to harness + while(coreAssigned % pagelim != 0){ + coreAssigned--; + } + } + + // connect to last available slave server and get info needed for all available (coreAssigned) slaves to handle a distributed query + int initialinfo = 0, nRows=0; + for (i=0;i<coreAssigned;i++){ + int err = 0, startIDint=0; + long long int numrows=0; + MYSQL *con = mysql_init(NULL); + if (con == NULL) + { + handle_error(con); + exit(0); + } + mysql_options(con,MYSQL_OPT_CONNECT_TIMEOUT,&timeout); + if (mysql_real_connect(con, ipOK[last], "remote_guest", "d0gemuchw0w", "wiby", 0, NULL, 0) == NULL) //connect to the last online server each iteration + { + handle_error(con); + err=1; + } + if(err==0){ + if(i==0){//get initial info + + //Get total number of rows + if (mysql_query(con, "SELECT COUNT(id) FROM windex;")) + { + handle_error(con); + } + MYSQL_RES *result = mysql_store_result(con); + if(result == NULL) + { + handle_error(con); + exit(0); + } + MYSQL_ROW row = mysql_fetch_row(result); + nRows = atoi(row[0]); + + //free old result data or else you'll get a memory leak + mysql_free_result(result); + + //Get the last row id number + if (mysql_query(con, "SELECT id FROM windex ORDER BY id DESC LIMIT 1;")) + { + handle_error(con); + } + 
result = mysql_store_result(con); + if(result == NULL) + { + handle_error(con); + exit(0); + } + row = mysql_fetch_row(result); + memset(lastID, 0, 50); + strcpy(lastID,row[0]); + + //free old result data or else you'll get a memory leak + mysql_free_result(result); + + if(reportinit==0) + printf("\nCurrent ID Ranges (Rows: %d)\n--------------------------------",nRows); + } + + //Get id of last row of the % of the db you want to search (depending on # of slaves) + numrows = (nRows / coreAssigned * i) + (nRows / coreAssigned) - 1; + //printf("\n%lld",numrows);fflush(stdout); + sprintf(totalRows, "%lld", numrows);//convert int to string + strcpy(strSQL,"SELECT id FROM windex ORDER BY id LIMIT "); + strcat(strSQL,totalRows); + strcat(strSQL,",1;"); + //SELECT id FROM windex ORDER BY id LIMIT n-1,1; + if (mysql_query(con, strSQL)) + { + handle_error(con); + } + MYSQL_RES *result2 = mysql_store_result(con); + if(result2 == NULL) + { + handle_error(con); + exit(0); + } + MYSQL_ROW row = mysql_fetch_row(result2); + + //store endID and startID + if(i+1 != coreAssigned) + strcpy(endID[i],row[0]); + else + strcpy(endID[i],lastID); + //strcpy(endID[i],row[0]); + + if(i==0){ + strcpy(startID[i],"0"); + }else{ + startIDint = atoi(endID[i-1])+1; + sprintf(startID[i], "%d", startIDint); + } + if(reportinit==0){ + printf("\n%s %s | %s %s",ipOK[i],dbOK[i],startID[i],endID[i]); + if(i+1 == coreAssigned) + printf("\n\n"); + fflush(stdout); + } + + //free old result data or else you'll get a memory leak + mysql_free_result(result2); + mysql_close(con); + + //update res file + if(i>0) + strcat(resfiletext,"\n"); + strcat(resfiletext,ipOK[i]); + strcat(resfiletext,","); + strcat(resfiletext,dbOK[i]); + strcat(resfiletext,","); + strcat(resfiletext,startID[i]); + strcat(resfiletext,","); + strcat(resfiletext,endID[i]); + } + } + //-------------------------------------------------------------------------------------------------------------------- + + //get resfiletext length + long 
resfiletextlen = strlen(resfiletext); + res = fopen("res.csv","rb"); + if (res==NULL) + { + printf("Error opening 'res.csv' file. Will create a new one.\n"); + res = fopen("res","w+"); + if (res==NULL) + { + printf("Error creating 'res.csv' file.\n"); + exit(0); + } + } + //Get file size + fseek(res, 0L, SEEK_END); + bytecount = ftell(res); + rewind(res); + + //check if res file is different from resfiletext string. + i=0; + int changed=0; + if(bytecount == resfiletextlen){ + while((c = fgetc(res)) != EOF) + { + if(c != resfiletext[i]){ + changed = 1; + } + i++; + } + fclose(res); + }else{ + changed = 1; + } + + reportinit = 1; + //store available servers in res file + if(changed == 1){ + res = fopen("res.csv", "w"); + fprintf(res, "%s", resfiletext); + fclose(res); + reportinit = 0; + } + if(running == 0){ + printf("Running\n"); + fflush(stdout); + running = 1; + } + + //fflush(stdout); + //free(resfiletext); + sleep(5); + } +} + diff --git a/c/servers.csv b/c/servers.csv new file mode 100644 index 0000000..e5434d6 --- /dev/null +++ b/c/servers.csv @@ -0,0 +1,4 @@ +localhost,ws0 +localhost,ws1 +localhost,ws2 +localhost,ws3 diff --git a/c/shards b/c/shards new file mode 100644 index 0000000..b8626c4 --- /dev/null +++ b/c/shards @@ -0,0 +1 @@ +4 diff --git a/c/urlparse.h b/c/urlparse.h new file mode 100755 index 0000000..75f189d --- /dev/null +++ b/c/urlparse.h @@ -0,0 +1,300 @@ +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +//char url[] = "index.htm\0"; +char urlcopy[1000]; +char domain[1000]; +char tldlist[] = "co.uk,org.uk,co.jp\0"; +char buffer[1000]; +char rootdomain[1000]; +char urlPath[1000]; +char folderPath[1000]; +char urlnopathnoprefix_fromlist[1000]; +char urlnoprefix_fromlist[10000]; +char prefix_fromlist[14]; +int prefixsize_fromlist=0; +int checkDomain(char *domain, char *substrLower, char *substrUpper, int domainLen, int substrLen); + +void urlparse(char* url){ +//int main(int argc, char *argv[]) { + int 
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

//Results of the most recent urlparse() call (all NUL terminated):
char urlcopy[1000];                     //input url with a trailing "index.html" removed
char domain[1000];                      //hostname part of the url (no prefix, no path)
char tldlist[] = "co.uk,org.uk,co.jp\0";//two-label TLDs that need one more label for the root domain
char buffer[1000];                      //scratch space for the TLD currently being tested
char rootdomain[1000];                  //hostname with the leading sub-domain label removed where applicable
char urlPath[1000];                     //path after the hostname, "/" when empty
char folderPath[1000];                  //directory part of the path, "/" when empty
char urlnopathnoprefix_fromlist[1000];  //url without prefix and without path
char urlnoprefix_fromlist[10000];       //url without the http(s)://(www.) prefix
char prefix_fromlist[14];               //detected prefix, "" when the url has none
int prefixsize_fromlist=0;              //NOTE(review): never written by urlparse(), which has always used a local for the prefix length; confirm whether callers expect it to be updated
int checkDomain(char *domain, char *substrLower, char *substrUpper, int domainLen, int substrLen);

//Split url into the global result buffers declared above.
//url: NUL terminated URL or relative link. URLs of 998 bytes or more are not
//parsed (all outputs stay empty except prefix_fromlist, which holds "http").
//A NULL url leaves every output empty.
void urlparse(char* url){
	int foundDot=0,foundDotInPath=0,foundSlash=0,foundColon=0,slashPos=0,lastSlashPos=0,folderPathLength=0,pathlen=0;
	int rootdomaincount=0;
	int isIPv4=1,isIPv6=1;
	memset(buffer,0,sizeof(buffer));
	memset(urlcopy,0,sizeof(urlcopy));
	memset(domain,0,sizeof(domain));
	memset(rootdomain,0,sizeof(rootdomain));
	memset(urlPath,0,sizeof(urlPath));
	memset(folderPath,0,sizeof(folderPath));
	memset(urlnoprefix_fromlist,0,sizeof(urlnoprefix_fromlist));
	memset(urlnopathnoprefix_fromlist,0,sizeof(urlnopathnoprefix_fromlist));

	//find out if its http or https or http://www. or https://www.
	int httpwww=0, httpswww=0, http=0, https=0;
	memset(prefix_fromlist,0,sizeof(prefix_fromlist));
	if(url == NULL)
		return;
	strcpy(prefix_fromlist,"http");
	int urlsize = strlen(url);

	if(urlsize<998){

		//copy url (variable from crawler)
		strcpy(urlcopy,url);

		//truncate any "index.html" files and just use the directory path
		if(urlsize == 10){
			if(checkDomain(urlcopy,"index.html","INDEX.HTML",urlsize,10)==1){
				urlcopy[0]=0;
				urlsize=0;
			}
		}
		if(urlsize > 10){
			if(checkDomain(urlcopy,"/index.html","/INDEX.HTML",urlsize,11)==1){
				urlcopy[urlsize-10]=0;
				urlsize-=10;
			}
		}

		//the scheme is recognised by the position of ':' only
		if(urlsize > 4){
			if(url[4]==':' && (url[3]=='p' || url[3]=='P'))
				http = 7;
		}
		if(urlsize > 5){
			if(url[5]==':' && (url[4]=='s' || url[4]=='S'))
				https = 8;
		}
		//www. / ww1. / ww2. / ww3. directly after the scheme; only looked for once a scheme has matched
		if(urlsize > 11){
			if(http > 0 && (url[7]=='w' || url[7]=='W') && (url[8]=='w' || url[8]=='W') && ((url[9]=='w' || url[9]=='W') || url[9]=='1' || url[9]=='2' || url[9]=='3') && url[10]=='.' ){
				httpwww = 11;
				http = https = 0;
			}
			if(https > 0 && url[7]=='/' && (url[8]=='w' || url[8]=='W') && (url[9]=='w' || url[9]=='W') && ((url[10]=='w' || url[10]=='W') || url[10]=='1' || url[10]=='2' || url[10]=='3') && url[11]=='.' ){
				httpswww = 12;
				http = https = 0;
			}
		}

		//set the prefix
		if(http > 0) strcat(prefix_fromlist,"://");
		else if(https > 0) strcat(prefix_fromlist,"s://");
		else if(httpwww > 0) strcat(prefix_fromlist,"://www.");
		else if(httpswww > 0) strcat(prefix_fromlist,"s://www.");

		//local on purpose, see the note on the global prefixsize_fromlist
		int prefixsize = httpswww+httpwww+https+http;

		int urlcount=0,urlnoprefixcount=0,urlnopathnoprefix_done=0,urlnopathnoprefix_len=0;

		//if no prefix, see if it might be a domain (a dot appears before any slash)
		int noprebutisdomain=0;
		if(prefixsize==0){
			memset(prefix_fromlist,0,sizeof(prefix_fromlist));
			while(urlcount < urlsize+1)
			{
				if(urlcopy[urlcount]=='.' && urlcount>0)
				{
					noprebutisdomain=1;
					break;
				}
				if(urlcopy[urlcount]=='/')
				{
					noprebutisdomain=0;
					break;
				}
				urlcount++;
			}
		}

		//store the url without prefix to urlnoprefix
		urlcount=0;
		if(prefixsize!=0 || noprebutisdomain==1){
			while(urlcount < urlsize)
			{
				if(urlcount>prefixsize-1)
				{
					urlnoprefix_fromlist[urlnoprefixcount]=urlcopy[urlcount];

					//get urlnopath: everything up to the first slash
					if(urlcopy[urlcount] != '/' && urlnopathnoprefix_done==0){
						urlnopathnoprefix_fromlist[urlnoprefixcount]=urlcopy[urlcount];
						urlnopathnoprefix_len++;
					}else{
						urlnopathnoprefix_done=1;
					}
					urlnoprefixcount++;
				}
				urlcount++;
			}
		}

		//a prefix-less "name.ext" with a page extension is a relative file, not a hostname
		if(noprebutisdomain==1 && strlen(urlnopathnoprefix_fromlist)>4){
			if(checkDomain(urlnopathnoprefix_fromlist,".html",".HTML",urlnopathnoprefix_len,5)==1 || checkDomain(urlnopathnoprefix_fromlist,".htm",".HTM",urlnopathnoprefix_len,4)==1 || checkDomain(urlnopathnoprefix_fromlist,".txt",".TXT",urlnopathnoprefix_len,4)==1 || checkDomain(urlnopathnoprefix_fromlist,".php",".PHP",urlnopathnoprefix_len,4)==1 || checkDomain(urlnopathnoprefix_fromlist,".shtml",".SHTML",urlnopathnoprefix_len,6)==1 || checkDomain(urlnopathnoprefix_fromlist,".xhtml",".XHTML",urlnopathnoprefix_len,6)==1 || checkDomain(urlnopathnoprefix_fromlist,".cgi",".CGI",urlnopathnoprefix_len,4)==1){
				memset(domain,0,sizeof(domain));
				memset(urlnoprefix_fromlist,0,sizeof(urlnoprefix_fromlist));
				memset(urlnopathnoprefix_fromlist,0,sizeof(urlnopathnoprefix_fromlist));
				urlnoprefixcount=0;
			}
		}

		//get domain name
		int lenurl=strlen(urlnoprefix_fromlist);
		int numDots=0;
		int i=0;
		for(i=0;i<lenurl;i++){

			//to get folder path, locate final slash position
			if(urlnoprefix_fromlist[i]=='/')
				lastSlashPos=i;

			//slashes are left as NUL (from the memset) so domain ends at the first slash
			if(urlnoprefix_fromlist[i]!='/')
				domain[i]=urlnoprefix_fromlist[i];
			if(urlnoprefix_fromlist[i]=='.' && foundSlash==0)
				numDots++;

			//get path after hostname
			if(urlnoprefix_fromlist[i]=='/' && foundSlash==0){
				foundSlash=1;
				slashPos=i-1;
				pathlen++;
			}
			if(foundSlash==1){
				urlPath[i-slashPos-1]=urlnoprefix_fromlist[i];
				pathlen++;
				if(urlnoprefix_fromlist[i]=='.')
					foundDotInPath=1;
			}

			if(urlnoprefix_fromlist[i]==':')
				foundColon=1;

			//Check if hostname is an IPv4 address (digits and dots only)
			if(((urlnoprefix_fromlist[i]<48 && urlnoprefix_fromlist[i] != '.') || (urlnoprefix_fromlist[i]>57)) && foundSlash==0)
				isIPv4=0;
			//NOTE(review): the per-character IPv6 test that used to sit here compared each
			//character against impossible ranges (e.g. <48 && >57) and therefore never fired.
			//In effect "isIPv6" means "the url contains a ':'" (host:port urls included).
			//Left as is because the crawler's root-domain handling may rely on it; confirm.
		}

		if(foundColon==0)
			isIPv6=0;

		if(isIPv6==1)//if ipv6, force it into working
			numDots=1;

		//a path without any dot is treated as a folder, so the whole path is the folder path
		if(foundDotInPath==0 && pathlen>1){
			lastSlashPos=lenurl;
		}

		//get folder path
		folderPathLength=lastSlashPos-slashPos;
		for(i=0;i<folderPathLength;i++){
			folderPath[i]=urlnoprefix_fromlist[i+slashPos+1];
		}
		//no hostname at all: the entire string is a path
		if(numDots==0 && isIPv6==0){
			memset(urlPath,0,sizeof(urlPath));
			memset(folderPath,0,sizeof(folderPath));
			strcpy(urlPath,urlnoprefix_fromlist);
			strcpy(folderPath,urlnoprefix_fromlist);
		}

		//make sure the folder path ends with a slash
		if(folderPathLength>2 && folderPath[i-2] != 0 && folderPath[i-2] != '/')
			folderPath[i-1]='/';

		if(urlPath[0]==0)
			urlPath[0]='/';
		if(folderPath[0]==0)
			folderPath[0]='/';

		int lendomain=strlen(domain);
		//get tld
		int lentldlist=strlen(tldlist);
		int foundDoubleDotTLD=0, k=0, firstSlash=0;
		for(i=0;i<=lentldlist;i++){
			if(tldlist[i] != ',' && tldlist[i] != 0){
				buffer[k]=tldlist[i];
				k++;
			}else if(foundDoubleDotTLD==0 && (tldlist[i] == ',' || tldlist[i] == 0)){
				//one complete TLD is in buffer, test it
				if(strstr(urlnoprefix_fromlist,buffer)!=NULL)
					foundDoubleDotTLD=1;
				if(numDots <=2 && foundDoubleDotTLD==1)
					strcpy(rootdomain,domain);
				if(numDots > 2 && foundDoubleDotTLD==1){
					//drop the first label, keep the rest of the hostname
					int j=0;
					for(j=0;j<lenurl;j++){
						if(foundDot==1){
							if(urlnoprefix_fromlist[j]=='/')
								firstSlash=1;
							if(firstSlash==0){
								rootdomain[rootdomaincount]=urlnoprefix_fromlist[j];
								rootdomaincount++;
							}
						}
						if(urlnoprefix_fromlist[j]=='.')
							foundDot=1;
					}
				}
				if (tldlist[i] == ','){
					memset(buffer,0,sizeof(buffer));
					k=0;
				}
			}else if(foundDoubleDotTLD==1){
				break;
			}
		}

		if(foundDoubleDotTLD==0){
			foundDot=rootdomaincount=0;
			if(numDots==1){
				strcpy(rootdomain,domain);
			}else if(numDots>1){
				//skip text before first dot (an IPv4 address is kept whole)
				for(i=0;i<lendomain;i++){
					if(foundDot==1 || isIPv4==1){
						rootdomain[rootdomaincount]=domain[i];
						rootdomaincount++;
					}
					if(domain[i]=='.')
						foundDot=1;
				}
			}
		}
	}
}

//Return 1 when the last substrLen characters of domain match, position by
//position, either substrLower or substrUpper (so mixed case matches too);
//return 0 otherwise, or when domain is shorter than substrLen.
//domainLen is the length of domain; substrLen the length of both patterns.
int checkDomain(char *domain, char *substrLower, char *substrUpper, int domainLen, int substrLen){
	if(domain == NULL || substrLower == NULL || substrUpper == NULL || substrLen < 0 || domainLen < substrLen)
		return 0;
	int j=0;
	for(int i=domainLen-substrLen;i<domainLen;i++){
		if(domain[i]!=substrLower[j] && domain[i]!=substrUpper[j]){
			return 0;
		}
		j++;
	}
	return 1;
}
+DROP TABLE IF EXISTS `feedback`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `feedback` ( + `id` bigint(20) NOT NULL AUTO_INCREMENT, + `message` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `time` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, + PRIMARY KEY (`id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_520_ci; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `feedback` +-- + +LOCK TABLES `feedback` WRITE; +/*!40000 ALTER TABLE `feedback` DISABLE KEYS */; +/*!40000 ALTER TABLE `feedback` ENABLE KEYS */; +UNLOCK TABLES; + +-- +-- Table structure for table `graveyard` +-- + +DROP TABLE IF EXISTS `graveyard`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `graveyard` ( + `id` bigint(20) NOT NULL AUTO_INCREMENT, + `url` text, + `worksafe` tinyint(1) DEFAULT NULL, + `reserved` text, + `reservetime` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, + PRIMARY KEY (`id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `graveyard` +-- + +LOCK TABLES `graveyard` WRITE; +/*!40000 ALTER TABLE `graveyard` DISABLE KEYS */; +/*!40000 ALTER TABLE `graveyard` ENABLE KEYS */; +UNLOCK TABLES; + +-- +-- Table structure for table `indexqueue` +-- + +DROP TABLE IF EXISTS `indexqueue`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `indexqueue` ( + `id` bigint(20) NOT NULL AUTO_INCREMENT, + `url` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `worksafe` tinyint(1) DEFAULT NULL, + `approver` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `surprise` tinyint(1) DEFAULT NULL, + `updatable` int(11) DEFAULT '1', + `task` 
tinyint(4) DEFAULT NULL, + `crawl_tree` text, + `crawl_family` text, + `crawl_depth` int(11) DEFAULT NULL, + `crawl_pages` int(11) DEFAULT NULL, + `crawl_type` int(11) DEFAULT NULL, + `crawl_repeat` tinyint(4) DEFAULT NULL, + `force_rules` tinyint(1) DEFAULT NULL, + `crawler_id` int(11) DEFAULT NULL, + PRIMARY KEY (`id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `indexqueue` +-- + +LOCK TABLES `indexqueue` WRITE; +/*!40000 ALTER TABLE `indexqueue` DISABLE KEYS */; +/*!40000 ALTER TABLE `indexqueue` ENABLE KEYS */; +UNLOCK TABLES; + +-- +-- Table structure for table `reviewqueue` +-- + +DROP TABLE IF EXISTS `reviewqueue`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `reviewqueue` ( + `id` bigint(20) NOT NULL AUTO_INCREMENT, + `url` mediumtext, + `worksafe` tinyint(1) DEFAULT NULL, + `reserved` mediumtext, + `reservetime` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, + `time` datetime DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (`id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `reviewqueue` +-- + +LOCK TABLES `reviewqueue` WRITE; +/*!40000 ALTER TABLE `reviewqueue` DISABLE KEYS */; +/*!40000 ALTER TABLE `reviewqueue` ENABLE KEYS */; +UNLOCK TABLES; + +-- +-- Table structure for table `windex` +-- + +DROP TABLE IF EXISTS `windex`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `windex` ( + `id` bigint(20) NOT NULL AUTO_INCREMENT, + `url` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `url_noprefix` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `title` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `tags` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `description` 
text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `body` longtext CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `language` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `surprise` tinyint(1) DEFAULT NULL, + `http` tinyint(1) DEFAULT NULL, + `updatable` int(11) DEFAULT '1', + `worksafe` tinyint(1) DEFAULT NULL, + `crawl_tree` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `crawl_family` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `crawl_pages` int(11) DEFAULT NULL, + `crawl_type` int(11) DEFAULT NULL, + `crawl_repeat` tinyint(1) DEFAULT NULL, + `force_rules` tinyint(1) DEFAULT NULL, + `enable` tinyint(1) DEFAULT NULL, + `date` datetime NOT NULL DEFAULT '0000-00-00 00:00:00', + `updated` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, + `approver` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `fault` tinyint(1) DEFAULT '0', + `shard` int(11) DEFAULT '0', + PRIMARY KEY (`id`), + FULLTEXT KEY `title` (`title`), + FULLTEXT KEY `url` (`url`), + FULLTEXT KEY `url_noprefix` (`url_noprefix`), + FULLTEXT KEY `main` (`tags`,`title`,`body`,`description`,`url`), + FULLTEXT KEY `tags` (`tags`), + FULLTEXT KEY `description` (`description`), + FULLTEXT KEY `body` (`body`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_520_ci; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `windex` +-- + +LOCK TABLES `windex` WRITE; +/*!40000 ALTER TABLE `windex` DISABLE KEYS */; +/*!40000 ALTER TABLE `windex` ENABLE KEYS */; +UNLOCK TABLES; + +-- +-- Table structure for table `ws0` +-- + +DROP TABLE IF EXISTS `ws0`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `ws0` ( + `id` bigint(20) NOT NULL AUTO_INCREMENT, + `url` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `url_noprefix` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `title` 
text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `tags` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `description` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `body` longtext CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `language` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `surprise` tinyint(4) DEFAULT NULL, + `http` tinyint(4) DEFAULT NULL, + `updatable` int(11) DEFAULT '1', + `worksafe` tinyint(4) DEFAULT NULL, + `crawl_tree` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `crawl_family` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `crawl_pages` int(11) DEFAULT NULL, + `crawl_type` int(11) DEFAULT NULL, + `crawl_repeat` tinyint(4) DEFAULT NULL, + `force_rules` tinyint(4) DEFAULT NULL, + `enable` tinyint(4) DEFAULT NULL, + `date` datetime NOT NULL DEFAULT '0000-00-00 00:00:00', + `updated` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, + `approver` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `fault` tinyint(4) DEFAULT '0', + `shard` int(11) DEFAULT '0', + PRIMARY KEY (`id`), + FULLTEXT KEY `main` (`tags`,`title`,`body`,`description`,`url`), + FULLTEXT KEY `title` (`title`), + FULLTEXT KEY `url` (`url`), + FULLTEXT KEY `url_noprefix` (`url_noprefix`), + FULLTEXT KEY `tags` (`tags`), + FULLTEXT KEY `description` (`description`), + FULLTEXT KEY `body` (`body`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_520_ci; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `ws0` +-- + +LOCK TABLES `ws0` WRITE; +/*!40000 ALTER TABLE `ws0` DISABLE KEYS */; +/*!40000 ALTER TABLE `ws0` ENABLE KEYS */; +UNLOCK TABLES; + +-- +-- Table structure for table `ws1` +-- + +DROP TABLE IF EXISTS `ws1`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `ws1` ( + `id` bigint(20) NOT NULL AUTO_INCREMENT, + `url` text CHARACTER 
SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `url_noprefix` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `title` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `tags` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `description` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `body` longtext CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `language` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `surprise` tinyint(4) DEFAULT NULL, + `http` tinyint(4) DEFAULT NULL, + `updatable` int(11) DEFAULT '1', + `worksafe` tinyint(4) DEFAULT NULL, + `crawl_tree` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `crawl_family` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `crawl_pages` int(11) DEFAULT NULL, + `crawl_type` int(11) DEFAULT NULL, + `crawl_repeat` tinyint(4) DEFAULT NULL, + `force_rules` tinyint(4) DEFAULT NULL, + `enable` tinyint(4) DEFAULT NULL, + `date` datetime NOT NULL DEFAULT '0000-00-00 00:00:00', + `updated` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, + `approver` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `fault` tinyint(4) DEFAULT '0', + `shard` int(11) DEFAULT '0', + PRIMARY KEY (`id`), + FULLTEXT KEY `main` (`tags`,`title`,`body`,`description`,`url`), + FULLTEXT KEY `title` (`title`), + FULLTEXT KEY `url` (`url`), + FULLTEXT KEY `url_noprefix` (`url_noprefix`), + FULLTEXT KEY `tags` (`tags`), + FULLTEXT KEY `description` (`description`), + FULLTEXT KEY `body` (`body`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_520_ci; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `ws1` +-- + +LOCK TABLES `ws1` WRITE; +/*!40000 ALTER TABLE `ws1` DISABLE KEYS */; +/*!40000 ALTER TABLE `ws1` ENABLE KEYS */; +UNLOCK TABLES; + +-- +-- Table structure for table `ws2` +-- + +DROP TABLE IF EXISTS `ws2`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 
SET character_set_client = utf8mb4 */; +CREATE TABLE `ws2` ( + `id` bigint(20) NOT NULL AUTO_INCREMENT, + `url` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `url_noprefix` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `title` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `tags` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `description` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `body` longtext CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `language` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `surprise` tinyint(4) DEFAULT NULL, + `http` tinyint(4) DEFAULT NULL, + `updatable` int(11) DEFAULT '1', + `worksafe` tinyint(4) DEFAULT NULL, + `crawl_tree` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `crawl_family` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `crawl_pages` int(11) DEFAULT NULL, + `crawl_type` int(11) DEFAULT NULL, + `crawl_repeat` tinyint(4) DEFAULT NULL, + `force_rules` tinyint(4) DEFAULT NULL, + `enable` tinyint(4) DEFAULT NULL, + `date` datetime NOT NULL DEFAULT '0000-00-00 00:00:00', + `updated` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, + `approver` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `fault` tinyint(4) DEFAULT '0', + `shard` int(11) DEFAULT '0', + PRIMARY KEY (`id`), + FULLTEXT KEY `main` (`tags`,`title`,`body`,`description`,`url`), + FULLTEXT KEY `title` (`title`), + FULLTEXT KEY `url` (`url`), + FULLTEXT KEY `url_noprefix` (`url_noprefix`), + FULLTEXT KEY `tags` (`tags`), + FULLTEXT KEY `description` (`description`), + FULLTEXT KEY `body` (`body`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_520_ci; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `ws2` +-- + +LOCK TABLES `ws2` WRITE; +/*!40000 ALTER TABLE `ws2` DISABLE KEYS */; +/*!40000 ALTER TABLE `ws2` ENABLE KEYS */; +UNLOCK TABLES; + +-- +-- Table 
structure for table `ws3` +-- + +DROP TABLE IF EXISTS `ws3`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `ws3` ( + `id` bigint(20) NOT NULL AUTO_INCREMENT, + `url` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `url_noprefix` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `title` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `tags` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `description` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `body` longtext CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `language` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `surprise` tinyint(4) DEFAULT NULL, + `http` tinyint(4) DEFAULT NULL, + `updatable` int(11) DEFAULT '1', + `worksafe` tinyint(4) DEFAULT NULL, + `crawl_tree` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `crawl_family` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `crawl_pages` int(11) DEFAULT NULL, + `crawl_type` int(11) DEFAULT NULL, + `crawl_repeat` tinyint(4) DEFAULT NULL, + `force_rules` tinyint(4) DEFAULT NULL, + `enable` tinyint(4) DEFAULT NULL, + `date` datetime NOT NULL DEFAULT '0000-00-00 00:00:00', + `updated` datetime DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, + `approver` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `fault` tinyint(4) DEFAULT '0', + `shard` int(11) DEFAULT '0', + PRIMARY KEY (`id`), + FULLTEXT KEY `main` (`tags`,`title`,`body`,`description`,`url`), + FULLTEXT KEY `title` (`title`), + FULLTEXT KEY `url` (`url`), + FULLTEXT KEY `url_noprefix` (`url_noprefix`), + FULLTEXT KEY `tags` (`tags`), + FULLTEXT KEY `description` (`description`), + FULLTEXT KEY `body` (`body`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_520_ci; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `ws3` +-- + +LOCK TABLES `ws3` 
WRITE; +/*!40000 ALTER TABLE `ws3` DISABLE KEYS */; +/*!40000 ALTER TABLE `ws3` ENABLE KEYS */; +UNLOCK TABLES; +/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; + +/*!40101 SET SQL_MODE=@OLD_SQL_MODE */; +/*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */; +/*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */; +/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; +/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; +/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; +/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; + +-- Dump completed on 2023-08-09 23:41:07 diff --git a/db/wibytemp.sql b/db/wibytemp.sql new file mode 100755 index 0000000..ab6752e --- /dev/null +++ b/db/wibytemp.sql @@ -0,0 +1,126 @@ +-- MySQL dump 10.13 Distrib 8.0.18, for Linux (x86_64) +-- +-- Host: localhost Database: wibytemp +-- ------------------------------------------------------ +-- Server version 8.0.18 + +/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; +/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; +/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; +/*!50503 SET NAMES utf8mb4 */; +/*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; +/*!40103 SET TIME_ZONE='+00:00' */; +/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */; +/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */; +/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */; +/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; + +-- +-- Table structure for table `crawled` +-- + +DROP TABLE IF EXISTS `crawled`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `crawled` ( + `id` bigint(20) NOT NULL AUTO_INCREMENT, + `url_noprefix` text, + `time` datetime DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (`id`), + FULLTEXT KEY `url_noprefix` (`url_noprefix`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 
COLLATE=utf8mb4_0900_ai_ci; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `crawled` +-- + +LOCK TABLES `crawled` WRITE; +/*!40000 ALTER TABLE `crawled` DISABLE KEYS */; +/*!40000 ALTER TABLE `crawled` ENABLE KEYS */; +UNLOCK TABLES; + +-- +-- Table structure for table `rejected` +-- + +DROP TABLE IF EXISTS `rejected`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `rejected` ( + `id` bigint(20) NOT NULL AUTO_INCREMENT, + `url` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `user` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `type` int(11) DEFAULT NULL, + `date` datetime DEFAULT NULL, + PRIMARY KEY (`id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_520_ci; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `rejected` +-- + +LOCK TABLES `rejected` WRITE; +/*!40000 ALTER TABLE `rejected` DISABLE KEYS */; +/*!40000 ALTER TABLE `rejected` ENABLE KEYS */; +UNLOCK TABLES; + +-- +-- Table structure for table `reserve_id` +-- + +DROP TABLE IF EXISTS `reserve_id`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE `reserve_id` ( + `id` bigint(20) NOT NULL, + `crawler_id` int(11) DEFAULT NULL, + `time` datetime DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (`id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_520_ci; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `reserve_id` +-- + +LOCK TABLES `reserve_id` WRITE; +/*!40000 ALTER TABLE `reserve_id` DISABLE KEYS */; +/*!40000 ALTER TABLE `reserve_id` ENABLE KEYS */; +UNLOCK TABLES; + +-- +-- Table structure for table `titlecheck` +-- + +DROP TABLE IF EXISTS `titlecheck`; +/*!40101 SET @saved_cs_client = @@character_set_client */; +/*!50503 SET character_set_client = utf8mb4 */; +CREATE TABLE 
`titlecheck` ( + `id` bigint(20) NOT NULL AUTO_INCREMENT, + `url` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + `title` text CHARACTER SET utf8mb4 COLLATE utf8mb4_unicode_520_ci, + PRIMARY KEY (`id`) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_520_ci; +/*!40101 SET character_set_client = @saved_cs_client */; + +-- +-- Dumping data for table `titlecheck` +-- + +LOCK TABLES `titlecheck` WRITE; +/*!40000 ALTER TABLE `titlecheck` DISABLE KEYS */; +/*!40000 ALTER TABLE `titlecheck` ENABLE KEYS */; +UNLOCK TABLES; +/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; + +/*!40101 SET SQL_MODE=@OLD_SQL_MODE */; +/*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */; +/*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */; +/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; +/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; +/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; +/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; + +-- Dump completed on 2023-08-05 23:15:21 diff --git a/etc/nginx/sites-available/default_example b/etc/nginx/sites-available/default_example new file mode 100755 index 0000000..66b79b7 --- /dev/null +++ b/etc/nginx/sites-available/default_example @@ -0,0 +1,270 @@ +## +# You should look at the following URL's in order to grasp a solid understanding +# of Nginx configuration files in order to fully unleash the power of Nginx. +# https://www.nginx.com/resources/wiki/start/ +# https://www.nginx.com/resources/wiki/start/topics/tutorials/config_pitfalls/ +# https://wiki.debian.org/Nginx/DirectoryStructure +# +# In most cases, administrators will remove this file from sites-enabled/ and +# leave it as reference inside of sites-available where it will continue to be +# updated by the nginx packaging team. +# +# This file will automatically load configuration files provided by other +# applications, such as Drupal or Wordpress. 
These applications will be made +# available underneath a path with that package name, such as /drupal8. +# +# Please see /usr/share/doc/nginx-doc/examples/ for more detailed examples. +## + +# Default server configuration +# + +fastcgi_cache_path /etc/nginx/phpcache levels=1:2 max_size=1g keys_zone=MYAPP:100m inactive=5m; +fastcgi_cache_key "$scheme$request_method$host$request_uri"; + +proxy_cache_path /etc/nginx/cache levels=1:2 keys_zone=main_cache:100m max_size=1g inactive=5m; +proxy_cache_key "$scheme$request_method$host$request_uri$cookie_ws"; + +#server { #redirect http to https +# listen 80 default_server; +# listen [::]:80 default_server ipv6only=on; +# server_name wiby.me; +# return 301 https://$host$request_uri; +#} +upstream remote_core { +# server 10.8.0.101:8080; +# server 10.8.0.102:8080; +# server 10.8.0.103:8080; +# server 10.8.0.104:8080; +# server 127.0.0.1:8080 backup; + server 127.0.0.1:8080; +} +server { #handles http requests. Allows for legacy browsers or else redirects to https + listen 80 default_server; +# listen [::]:80 default_server ipv6only=off; #this prevented nginx from starting on my vps, said port was in use + server_name _ ; + + if ( $http_user_agent ~ (Chrome)) { #redirect to https for old chrome devices + return 301 https://$host$request_uri; + } + + + root /var/www/html; + + # Add index.php to the list if you are using PHP + index index.php index.html index.htm; + + #comment all "core app" location entries to revert wiby search to php + + location = / { #core app + proxy_cache main_cache; + proxy_cache_valid 5m; + proxy_cache_bypass $no_cache; + proxy_no_cache $no_cache; + try_files $uri $uri/ =404; + proxy_set_header X-Real-IP $remote_addr; + #proxy_pass http://127.0.0.1:8080/; + proxy_pass http://remote_core/; + } + location /settings/ { #core app + try_files $uri $uri/ =404; + proxy_set_header X-Real-IP $remote_addr; + proxy_pass http://127.0.0.1:8080/settings/; + #proxy_pass http://remote_core/settings/; + } + location = 
/json/ { #core app + proxy_cache main_cache; + proxy_cache_valid 5m; + proxy_cache_bypass $no_cache; + proxy_no_cache $no_cache; + try_files $uri $uri/ =404; + proxy_set_header X-Real-IP $remote_addr; + #proxy_pass http://127.0.0.1:8080/json/; + proxy_pass http://remote_core/json/; + } + location = /surprise/ { #core app + # try_files $uri $uri/ =404; + proxy_set_header X-Real-IP $remote_addr; + proxy_pass http://127.0.0.1:8080/surprise/; + #proxy_pass http://remote_core/surprise/; + } + + location / { + # First attempt to serve request as file, then + # as directory, then fall back to displaying a 404. + try_files $uri $uri/ =404; + } + + # pass PHP scripts to FastCGI server + # + location ~ \.php$ { + # try_files $url = 404; + fastcgi_split_path_info ^(.+\.php)(/.+)$; + include snippets/fastcgi-php.conf; + include fastcgi_params; + # + # # With php-fpm (or other unix sockets): + fastcgi_param SCRIPT_FILENAME $document_root$fastcgi_script_name; + fastcgi_pass unix:/var/run/php/php7.4-fpm.sock; + # fastcgi_index index.php; + # # With php-cgi (or other tcp sockets): + # fastcgi_pass 127.0.0.1:9000; + # #for microcaching + fastcgi_cache MYAPP; + fastcgi_cache_valid 5m; + fastcgi_cache_bypass $no_cache; + fastcgi_no_cache $no_cache; + } + + # deny access to .htaccess files, if Apache's document root + + # concurs with nginx's one + # + #location ~ /\.ht { + # deny all; + #} + + # Don't cache the following URLs + if ($request_uri ~* "/(review/|index.php)") { set $no_cache 1; } + if ($request_uri ~* "/(review/|login.html.php)") { set $no_cache 1; } + if ($request_uri ~* "/(review/|inndexqueue.html.php)") { set $no_cache 1; } + if ($request_uri ~* "/(review/|review.php)") { set $no_cache 1; } + if ($request_uri ~* "/(review/|error.html.php)") { set $no_cache 1; } + + if ($request_uri ~* "/(insert/|index.php)") { set $no_cache 1; } + if ($request_uri ~* "/(insert/|login.html.php)") { set $no_cache 1; } + if ($request_uri ~* "/(insert/|form.html.php)") { set $no_cache 1; } 
+ if ($request_uri ~* "/(insert/|insert.php)") { set $no_cache 1; } + if ($request_uri ~* "/(insert/|error.html.php)") { set $no_cache 1; } + if ($request_uri ~* "/(insert/|insert.html.php)") { set $no_cache 1; } + + if ($request_uri ~* "/(surprise/|index.php)") { set $no_cache 1; } + + if ($request_uri ~* "/(submit/|index.php)") { set $no_cache 1; } + if ($request_uri ~* "/(submit/|form.html.php)") { set $no_cache 1; } + if ($request_uri ~* "/(submit/|submit.html.php)") { set $no_cache 1; } + if ($request_uri ~* "/(submit/|error.html.php)") { set $no_cache 1; } +} + +server { + #listen 80 default_server; #comment this out if you dont want http + #listen [::]:80 default_server; #comment this out if you dont want http + + # SSL configuration + # + listen 443 ssl default_server; + #listen [::]:443 ssl default_server; #nginx wasnt starting on my vps with this + + ssl_certificate /etc/nginx/ssl/YOUR_SSL_CERT.crt; + ssl_certificate_key /etc/nginx/ssl/YOUR_SSL_KEY.key; + # + # Note: You should disable gzip for SSL traffic. + # See: https://bugs.debian.org/773332 + # + # Read up on ssl_ciphers to ensure a secure configuration. + # See: https://bugs.debian.org/765782 + # + # Self signed certs generated by the ssl-cert package + # Don't use them in a production server! 
+ # + # include snippets/snakeoil.conf; + + root /var/www/html; + + # Add index.php to the list if you are using PHP + index index.php index.html index.htm; + + server_name none; + + #comment all "core app" location entries to revert wiby search to php + + location = / { #core app + proxy_cache main_cache; + proxy_cache_valid 5m; + proxy_cache_bypass $no_cache; + proxy_no_cache $no_cache; + try_files $uri $uri/ =404; + proxy_set_header X-Real-IP $remote_addr; + #proxy_pass http://127.0.0.1:8080/; + proxy_pass http://remote_core/; + } + location /settings/ { #core app + try_files $uri $uri/ =404; + proxy_set_header X-Real-IP $remote_addr; + proxy_pass http://127.0.0.1:8080/settings/; + #proxy_pass http://remote_core/settings/; + } + location = /json/ { #core app + proxy_cache main_cache; + proxy_cache_valid 5m; + proxy_cache_bypass $no_cache; + proxy_no_cache $no_cache; + try_files $uri $uri/ =404; + proxy_set_header X-Real-IP $remote_addr; + #proxy_pass http://127.0.0.1:8080/json/; + proxy_pass http://remote_core/json/; + } + location = /surprise/ { #core app + # try_files $uri $uri/ =404; + proxy_set_header X-Real-IP $remote_addr; + proxy_pass http://127.0.0.1:8080/surprise/; + #proxy_pass http://remote_core/surprise/; + } + + location / { + # First attempt to serve request as file, then + # as directory, then fall back to displaying a 404. 
+ try_files $uri $uri/ =404; + } + + # pass PHP scripts to FastCGI server + # + location ~ \.php$ { + # try_files $url = 404; + fastcgi_split_path_info ^(.+\.php)(/.+)$; + include snippets/fastcgi-php.conf; + include fastcgi_params; + # + # # With php-fpm (or other unix sockets): + fastcgi_param SCRIPT_FILENAME $document_root$fastcgi_script_name; + fastcgi_pass unix:/var/run/php/php7.4-fpm.sock; + # fastcgi_index index.php; + # # With php-cgi (or other tcp sockets): + # fastcgi_pass 127.0.0.1:9000; + # #for microcaching + fastcgi_cache MYAPP; + fastcgi_cache_valid 5m; + fastcgi_cache_bypass $no_cache; + fastcgi_no_cache $no_cache; + } + + + # deny access to .htaccess files, if Apache's document root + + # concurs with nginx's one + # + #location ~ /\.ht { + # deny all; + #} + + # Don't cache the following URLs + if ($request_uri ~* "/(review/|index.php)") { set $no_cache 1; } + if ($request_uri ~* "/(review/|login.html.php)") { set $no_cache 1; } + if ($request_uri ~* "/(review/|inndexqueue.html.php)") { set $no_cache 1; } + if ($request_uri ~* "/(review/|review.php)") { set $no_cache 1; } + if ($request_uri ~* "/(review/|error.html.php)") { set $no_cache 1; } + + if ($request_uri ~* "/(insert/|index.php)") { set $no_cache 1; } + if ($request_uri ~* "/(insert/|login.html.php)") { set $no_cache 1; } + if ($request_uri ~* "/(insert/|form.html.php)") { set $no_cache 1; } + if ($request_uri ~* "/(insert/|insert.php)") { set $no_cache 1; } + if ($request_uri ~* "/(insert/|error.html.php)") { set $no_cache 1; } + if ($request_uri ~* "/(insert/|insert.html.php)") { set $no_cache 1; } + + if ($request_uri ~* "/(surprise/|index.php)") { set $no_cache 1; } + + if ($request_uri ~* "/(submit/|index.php)") { set $no_cache 1; } + if ($request_uri ~* "/(submit/|form.html.php)") { set $no_cache 1; } + if ($request_uri ~* "/(submit/|submit.html.php)") { set $no_cache 1; } + if ($request_uri ~* "/(submit/|error.html.php)") { set $no_cache 1; } +} diff --git a/go/core/1core 
b/go/core/1core new file mode 100755 index 0000000..ec1f352 Binary files /dev/null and b/go/core/1core differ diff --git a/go/core/1core.go b/go/core/1core.go new file mode 100755 index 0000000..534d46a --- /dev/null +++ b/go/core/1core.go @@ -0,0 +1,1054 @@ +package main + +import ( + "database/sql" + _ "github.com/go-sql-driver/mysql" +// "fmt" + "html" + "html/template" + "log" + "net/http" + "net/url" + "strconv" + "strings" + "unicode/utf8" + // "time" +) + +type indexPage struct{} +type errorReport struct{ Error string } +type surpriseURL struct{ Url string } +type settingsPage struct{ Worksafe, FilterHTTPS bool } +type MySQLResults struct{ Id, Url, Title, Description, Body string } +type PageData struct { + DBResults []MySQLResults + Query, Page string + FindMore bool +} + +func main() { + http.HandleFunc("/", handler) + http.HandleFunc("/json", handler) + http.HandleFunc("/json/", handler) + http.HandleFunc("/surprise", surprise) + http.HandleFunc("/surprise/", surprise) + http.HandleFunc("/settings/", settings) + http.HandleFunc("/settings", settings) + log.Fatal(http.ListenAndServe("localhost:8080", nil)) +} + +//https://golang.org/pkg/net/http/#Request +func handler(w http.ResponseWriter, r *http.Request) { + //fmt.Fprintf(w, "%s %s \n", r.Method, r.URL) + //fmt.Fprintf(w, "%s \n", r.URL.RawQuery) + + //check if worksafe+https cookie enabled. 
+ filterHTTPS := false + worksafe := true + worksafeHTTPSCookie, err := r.Cookie("ws") + if err != nil { + worksafe = true + filterHTTPS = false + } else if worksafeHTTPSCookie.Value == "0" { + worksafe = false + filterHTTPS = false + } else if worksafeHTTPSCookie.Value == "1" { + worksafe = true + filterHTTPS = false + } else if worksafeHTTPSCookie.Value == "2" { + worksafe = false + filterHTTPS = true + } else if worksafeHTTPSCookie.Value == "3" { + worksafe = true + filterHTTPS = true + } + + //setup for error report + error := errorReport{} + + //Get the raw query + m, _ := url.ParseQuery(r.URL.RawQuery) + //Get the query parameters (q and o) + //fmt.Fprintf(w,"%s\n%s\n", m["q"][0], m["o"][0]) + + json := false + if strings.Contains(r.URL.Path, "/json") { + json = true + if _, ok := m["nsfw"]; ok { //check if &nsfw added to json url + worksafe = false + } + } + + query := "" + queryNoQuotes := "" + + offset := "0" + page := "0" + + //Check if query and page params exist + if _, ok := m["q"]; ok { + query = m["q"][0] + query = strings.Replace(query, "'", "''", -1) + query = strings.Replace(query, "- ", " ", -1) + queryNoQuotes = query + } + if _, ok := m["p"]; ok {//gets page num, will convert to offset further down + page = m["p"][0] + page = strings.Replace(page, "'", "''", -1) + offset = page + } + + lim := "12" + + if query == "" { //what to do if no query is found? 
+ //load index if no query detected + if r.URL.Path == "/" { + p := indexPage{} + t, _ := template.ParseFiles("coreassets/form.html.go") + t.Execute(w, p) + } else if strings.Contains(r.URL.Path, "/json") { //load json info page if json selected + p := indexPage{} + t, _ := template.ParseFiles("coreassets/json/json.html.go") + t.Execute(w, p) + } else { + p := indexPage{} + t, _ := template.ParseFiles("coreassets/form.html.go") + t.Execute(w, p) + } + } else { + + //Make sure offset is a number + offsetInt, err := strconv.Atoi(offset) + if err != nil { + offset = "0" + offsetInt = 0 + } + + //Make sure page is a number + pageInt, err := strconv.Atoi(page) + if err != nil { + page = "0" + pageInt = 0 + } + + //Convert lim to number + limInt, _ := strconv.Atoi(lim) + + //convert page num to offset + if offsetInt > 0 { + offsetInt --; + } + offsetInt = offsetInt * limInt + offset = strconv.Itoa(offsetInt) + + + //get some details from the raw query + var additions string + querylen := len(query) + + //see if a search redirect (! 
or &) is used for a different search engine + if json == false && (strings.Contains(m["q"][0],"!") || strings.Contains(m["q"][0],"&")){ + searchredirect(w, r, m["q"][0]) + } + + //phone users + if query[querylen-1] == ' '{ + query = query[:querylen-1] + queryNoQuotes = queryNoQuotes[:len(queryNoQuotes)-1] + querylen = len(query) + } + if querylen > 1 && query[0] == ' '{ + query = query[1:querylen] + queryNoQuotes = queryNoQuotes[1:len(queryNoQuotes)] + querylen = len(query) + } + + //check if user wants to limit search to a specific website + sitePos := -1 + siteEnd := 0 + siteURL := "" + if strings.Index(strings.ToLower(query), "site:") > -1 { + //get url user wants to search and remove it from the query string + sitePos = strings.Index(strings.ToLower(query), "site:") + siteEnd = strings.Index(query[sitePos:], " ") + //fmt.Printf("\n%d\n%d\n",sitePos,siteEnd) + if siteEnd > -1 && sitePos > 1 { //site is not last part of query + siteURL = query[sitePos+5 : siteEnd+sitePos] + query = query[:sitePos-1] + query[siteEnd+sitePos:] + queryNoQuotes = queryNoQuotes[:sitePos-1] + queryNoQuotes[siteEnd+sitePos:] + additions = additions + "AND url LIKE '%" + siteURL + "%' " + } else if siteEnd > -1 && sitePos == 0 { //site is at beginning + siteURL = query[sitePos+5 : siteEnd] + query = query[siteEnd+1:] + queryNoQuotes = queryNoQuotes[siteEnd+1:] + additions = additions + "AND url LIKE '%" + siteURL + "%' " + } else if siteEnd < 0 && sitePos > 1 { //site is at end + siteURL = query[sitePos+5:] + query = query[:sitePos-1] + queryNoQuotes = queryNoQuotes[:sitePos-1] + additions = additions + "AND url LIKE '%" + siteURL + "%' " + }else if querylen > 5{ + query = query[5:] + } + querylen = len(query) + } + //fmt.Printf("Addition: \n%s\nQuery: '%s'\n",additions,query) + + //see if user uses -https flag (instead of cookie settings option) + if querylen > 7 && strings.ToLower(query[querylen-7:querylen]) == " -https" { + filterHTTPS = true + query = query[0 : querylen-7] + 
querylen = len(query) + } + + //check if user wants to search within a time window (day,week,month) + option := "" + //fmt.Printf("\n'%s'\n",query) + location := strings.Index(query, " !") + if location == -1 { + location = strings.Index(query, " &") + } + if location > -1 && strings.Index(query[location+1:querylen], " ") == -1 { //option is at end of query + option = query[location+2 : querylen] + query = query[:location] + queryNoQuotes = queryNoQuotes[:location] + querylen = len(query) + }else if querylen > 0 && (query[0] == '!' || query[0] == '&') && strings.Index(query, " ") > -1{ //option is at start of query + option = query[1:strings.Index(query, " ")] + query = query[strings.Index(query, " ")+1:] + queryNoQuotes = queryNoQuotes[strings.Index(queryNoQuotes, " ")+1:] + querylen = len(query) + } + option = strings.ToLower(option) + if option != "" { + if option == "td" { //day + additions = additions + "AND date > NOW() - INTERVAL 1 DAY " + } else if option == "tw" { //week + additions = additions + "AND date > NOW() - INTERVAL 7 DAY " + } else if option == "tm" { //month + additions = additions + "AND date > NOW() - INTERVAL 30 DAY " + } else if option == "ty" { //year + additions = additions + "AND date > NOW() - INTERVAL 365 DAY " + } + } + + //check if worksafe and filterHTTPS flags set + if worksafe == true { + additions = additions + "AND worksafe = '1' " + } + if filterHTTPS == true { + additions = additions + "AND http = '1' " + } + + //search if query has quotes and remove them (so we can find the longest word in the query) + exactMatch := false + //queryNoQuotes := query + if strings.Contains(query, "\"") { + exactMatch = true + queryNoQuotes = strings.Replace(queryNoQuotes, "\"", "", -1) + } + + //remove the '*' if contained anywhere in queryNoQuotes + if strings.Contains(queryNoQuotes, "*") && exactMatch == false { + queryNoQuotes = strings.Replace(queryNoQuotes, "*", "", -1) + } + + //Prepare to find longest word in query + words := 
strings.Split(queryNoQuotes, " ") + longestWordLength := 0 + longestWord := "" + wordcount := 0 + longestwordelementnum := 0 + queryNoQuotesOrFlags := queryNoQuotes + requiredword := "" + flags := "" + flagssetbyuser := 0 + wordlen := 0 + numRequiredWords := 0 + //queryNoFlags := "" + + //first remove any flags inside var queryNoQuotes, also grab any required words (+ prefix) + if strings.Contains(queryNoQuotes, "-") || strings.Contains(queryNoQuotes, "+") { + queryNoQuotesOrFlags = "" + for i, wordNoFlags := range words { + if i > 0 && strings.HasPrefix(wordNoFlags, "-") == false && strings.HasPrefix(wordNoFlags, "+") == false { //add a space after + queryNoQuotesOrFlags += " " + } + if strings.HasPrefix(wordNoFlags, "-") == false && strings.HasPrefix(wordNoFlags, "+") == false { + queryNoQuotesOrFlags += wordNoFlags + } + if strings.HasPrefix(wordNoFlags, "+") == true && len(wordNoFlags) > 1 && requiredword == "" { //get requiredword + requiredword = wordNoFlags[1:len(wordNoFlags)] + } + if i > 0 && strings.HasPrefix(wordNoFlags, "-") == true || strings.HasPrefix(wordNoFlags, "+") == true { + flags += " " + wordNoFlags + flagssetbyuser++ + if strings.HasPrefix(wordNoFlags, "+") == true { + numRequiredWords++ + } + } + } + flags = checkformat(flags) + } + //fmt.Printf("\n%s",flags) + + //now find longest word + words = strings.Split(queryNoQuotesOrFlags, " ") + for _, word := range words { + if len(word) > longestWordLength { + longestWordLength = len(word) + longestWord = word + longestwordelementnum = wordcount + } + if word != ""{ + wordcount++ + } + } + + //create another query where all compatible words are marked as required + reqwordQuery := "" + for i, word := range words{ + wordlen = len(word) + if i==0 && (strings.HasPrefix(word, "+") == true || strings.HasPrefix(word, "-") == true) && wordlen > 3{ + reqwordQuery += word + } + if i==0 && (strings.HasPrefix(word, "+") == false && strings.HasPrefix(word, "-") == false) { + if wordlen > 2 { + reqwordQuery 
+= "+" + } + reqwordQuery += word + } + if i!=0 && (strings.HasPrefix(word, "+") == true || strings.HasPrefix(word, "-") == true) && wordlen > 3{ + reqwordQuery += " " + reqwordQuery += word + } + if i!=0 && (strings.HasPrefix(word, "+") == false && strings.HasPrefix(word, "-") == false) { + reqwordQuery += " " + if wordlen > 2 { + reqwordQuery += "+" + } + reqwordQuery += word + } + } + reqwordQuery = checkformat(reqwordQuery) + reqwordQuery += flags + + //fmt.Printf("\n%s",reqwordQuery) + + //get copy of original query because we might have to modify it further + queryOriginal := query + + tRes := MySQLResults{} + var res = PageData{} + + //init the db and set charset + db, err := sql.Open("mysql", "guest:qwer@/wiby?charset=utf8mb4") + if err != nil { + p := indexPage{} + t, _ := template.ParseFiles("coreassets/error.html.go") + t.Execute(w, p) + } + defer db.Close() + + // Open doesn't open a connection. Validate DSN data: + err = db.Ping() + if err != nil { + error.Error = err.Error() + t, _ := template.ParseFiles("coreassets/error.html.go") + t.Execute(w, error) + } + + //Check if query is a url. 
+ urlDetected := false + isURL := "" + if strings.Index(query, " ") == -1 && strings.Index(query, "\"") == -1 && strings.Index(query, ".") > -1 { //note this will also flag on file extensions + if len(query) > 6 && (query[0:7] == "http://" || query[0:7] == "HTTP://") { + query = query[7:] + } else if len(query) > 7 && (query[0:8] == "https://" || query[0:8] == "HTTPS://") { + query = query[8:] + } + if len(queryNoQuotes) > 6 && (queryNoQuotes[0:7] == "http://" || queryNoQuotes[0:7] == "HTTP://") { + queryNoQuotes = queryNoQuotes[7:] + } else if len(queryNoQuotes) > 7 && (queryNoQuotes[0:8] == "https://" || queryNoQuotes[0:8] == "HTTPS://") { + queryNoQuotes = queryNoQuotes[8:] + } + query = "\"" + query + "\"" + urlDetected = true + isURL = "WHEN MATCH(url) AGAINST('\"" + queryNoQuotes + "\"' IN BOOLEAN MODE) THEN 25" + } + + //if no required words set, make the longest word in the query required. + querywithrequiredword := "" + if numRequiredWords == 0 && wordcount > 1 && longestWordLength > 2{ + querywithrequiredword = query + " +" + querywithrequiredword = querywithrequiredword + longestWord + } + + + //fmt.Printf(">%s<\n", reqwordQuery) + queryWithQuotesAndFlags := "\"" + queryNoQuotesOrFlags + "\"" + flags + queryWithQuotes := "\"" + queryNoQuotesOrFlags + "\"" + + //if query is just 1 or 2 letters, help make it work. 
+ if utf8.RuneCountInString(queryOriginal) < 3 { + queryfix := "" + query + "*" + query = queryfix + queryWithQuotesAndFlags = queryfix + reqwordQuery = queryfix + } + if strings.Contains(queryOriginal,"c++")==true || strings.Contains(queryOriginal,"C++")==true{ // :) :( :) :( + exactMatch=true + queryWithQuotesAndFlags += " +programming" + if strings.Contains(queryOriginal," ")==true && longestWordLength>3{ + queryWithQuotesAndFlags += " +" + queryWithQuotesAndFlags += longestWord + } + } + + querytouse := query + if querywithrequiredword != ""{ + querytouse = querywithrequiredword + }else if numRequiredWords > 0{ + querytouse = reqwordQuery + } + + if exactMatch == false && urlDetected == false { + querytouse = checkformat(querytouse) + reqwordQuery = checkformat(reqwordQuery) + } + + //perform full text search FOR InnoDB STORAGE ENGINE or MyISAM + var sqlQuery, id, url, title, description, body string + + if (exactMatch==false || flagssetbyuser > 0) && urlDetected==false && strings.Index(query, " ") != -1 && flagssetbyuser + wordcount != flagssetbyuser{ + sqlQuery = "SELECT id, url, title, description, body FROM windex WHERE MATCH(tags, body, description, title, url) AGAINST('" + querytouse + "' IN BOOLEAN MODE) AND enable = '1' " + additions + "ORDER BY CASE WHEN MATCH(tags) AGAINST('" + queryWithQuotes + "' IN BOOLEAN MODE) THEN 30 " + isURL + " WHEN MATCH(title) AGAINST('" + queryWithQuotes + "' IN BOOLEAN MODE) THEN 20 WHEN MATCH(body) AGAINST('" + queryWithQuotes + "' IN BOOLEAN MODE) OR MATCH(description) AGAINST('" + queryWithQuotes + "' IN BOOLEAN MODE) THEN 15 WHEN MATCH(title) AGAINST('" + reqwordQuery + "' IN BOOLEAN MODE) THEN 14 WHEN MATCH(title) AGAINST('" + querytouse + "' IN BOOLEAN MODE) THEN 13 END DESC, id DESC LIMIT " + lim + " OFFSET " + offset + "" + }else{ + sqlQuery = "SELECT id, url, title, description, body FROM windex WHERE MATCH(tags, body, description, title, url) AGAINST('" + queryWithQuotesAndFlags + "' IN BOOLEAN MODE) AND enable 
= '1' " + additions + "ORDER BY CASE WHEN MATCH(tags) AGAINST('" + queryWithQuotesAndFlags + "' IN BOOLEAN MODE) THEN 30 " + isURL + " WHEN MATCH(title) AGAINST('" + queryWithQuotesAndFlags + "' IN BOOLEAN MODE) THEN 20 END DESC, id DESC LIMIT " + lim + " OFFSET " + offset + "" + } + rows, err := db.Query(sqlQuery) + //fmt.Printf("\n%s\n",sqlQuery) + if err != nil { + res.Page = strconv.Itoa(0) + res.Query = m["q"][0] //get original unsafe query + if json { + w.Header().Set("Content-Type", "application/json") + t, _ := template.ParseFiles("coreassets/json/results.json.go") + t.Execute(w, res) + } else { + t, _ := template.ParseFiles("coreassets/results.html.go") + t.Execute(w, res) + } + //p := indexPage{} + //t, _ := template.ParseFiles("coreassets/form.html.go") + //t.Execute(w, p) + return + } + + if urlDetected == true { + query = queryOriginal + } + + count := 0 + wordtocheck := "" + stringtofind := strings.ToLower(queryNoQuotesOrFlags) + stringtofind = strings.Replace(stringtofind, "''", "'", -1) + requiredwordtofind := strings.ToLower(requiredword) + requiredwordtofind = strings.Replace(requiredwordtofind, "''", "'", -1) + longestWordtofind := strings.ToLower(longestWord) + longestWordtofind = strings.Replace(longestWordtofind, "''", "'", -1) + + for rows.Next() { + count++ + //this will get set if position of longest word of query is found within body + pos := -1 + + err := rows.Scan(&id, &url, &title, &description, &body) + if err != nil { + error.Error = err.Error() + t, _ := template.ParseFiles("coreassets/error.html.go") + t.Execute(w, error) + } + + //find query inside body of page + if exactMatch == false && (numRequiredWords == 0 || numRequiredWords + wordcount == numRequiredWords){ + if len(requiredword) > 0 { //search for position of required word if any, else search for position of whole query + pos = strings.Index(strings.ToLower(body), requiredwordtofind) + } else if pos == -1 { + pos = strings.Index(strings.ToLower(body), stringtofind) + } + + 
if pos == -1 { //not found? find position of longest query word + pos = strings.Index(strings.ToLower(body), longestWordtofind) + //not found?, set position to a different word + if pos == -1 && wordcount > 1 { + if longestwordelementnum > 0 { + //wordtocheck = strings.Replace(words[0], "*", "", -1) + wordtocheck = strings.Replace(words[0], "''", "'", -1) + pos = strings.Index(strings.ToLower(body), strings.ToLower(wordtocheck)) + } + if longestwordelementnum == 0 { + //wordtocheck = strings.Replace(words[1], "*", "", -1) + wordtocheck = strings.Replace(words[1], "''", "'", -1) + pos = strings.Index(strings.ToLower(body), strings.ToLower(wordtocheck)) + } + } + } + } else { //if exact match, find position of query within body + pos = strings.Index(strings.ToLower(body), stringtofind) + } + + //still not found?, set position to 0 + if pos == -1 { + pos = 0 + } + + //Adjust position for runes within body + pos = utf8.RuneCountInString(body[:pos]) + + starttext := 0 + //ballpark := 0 + ballparktext := "" + + //figure out how much preceding text to use + if pos < 32 { + starttext = 0 + } else if pos > 25 { + starttext = pos - 25 + } else if pos > 20 { + starttext = pos - 15 + } + + //total length of the ballpark + textlength := 180 + + //populate the ballpark + if pos >= 0 { + ballparktext = substr(body, starttext, starttext+textlength) + } //else{ ballpark = 0}//looks unused + + //find position of nearest Period + //foundPeriod := true + posPeriod := strings.Index(ballparktext, ". 
") + starttext + 1 + + //find position of nearest Space + //foundSpace := true + posSpace := strings.Index(ballparktext, " ") + starttext + + //if longest word in query is after a period+space within ballpark, reset starttext to that point + if (pos - starttext) > posPeriod { + starttext = posPeriod + //populate the bodymatch + if (pos - starttext) >= 0 { + body = substr(body, starttext, starttext+textlength) + } else { + body = "" + } + } else if pos > posSpace { //else if longest word in query is after a space within ballpark, reset starttext to that point + //else if(pos-starttext) > posSpace//else if longest word in query is after a space within ballpark, reset starttext to that point + starttext = posSpace + //populate the bodymatch + if (pos - starttext) >= 0 { + body = substr(body, starttext, starttext+textlength) + } else { + body = "" + } + } else //else just set the bodymatch to the ballparktext + { + //populate the bodymatch + if (pos - starttext) >= 0 { + body = ballparktext + } else { + body = "" + } + } + + tRes.Id = id + tRes.Url = url + tRes.Title = html.UnescapeString(title) + tRes.Description = html.UnescapeString(description) + tRes.Body = html.UnescapeString(body) + if json == true { + tRes.Title = JSONRealEscapeString(tRes.Title) + tRes.Description = JSONRealEscapeString(tRes.Description) + tRes.Body = JSONRealEscapeString(tRes.Body) + } + res.DBResults = append(res.DBResults, tRes) + } + defer rows.Close() + rows.Close() + //================================================================================================================================ + //no results found (count==0), so do a wildcard search (repeat the above process) - this section will probably be removed, no longer useful + addWildcard := false + /*if count == 0 && offset == "0" && urlDetected == false && exactMatch == false { + addWildcard = true + query = strings.Replace(query, "\"", "", -1) //remove some things innodb gets fussy over + query = strings.Replace(query, "*", 
"", -1) + query = strings.Replace(query, "'", "", -1) + queryNoQuotes = strings.Replace(queryNoQuotes, "\"", "", -1) + queryNoQuotes = strings.Replace(queryNoQuotes, "*", "", -1) + queryNoQuotes = strings.Replace(queryNoQuotes, "'", "", -1) + query = query + "*" + + sqlQuery = "SELECT id, url, title, description, body FROM windex WHERE Match(tags, body, description, title, url) Against('" + query + "' IN BOOLEAN MODE) AND enable = '1' " + additions + "ORDER BY CASE WHEN MATCH(tags) AGAINST('" + queryWithQuotesAndFlags + "' IN BOOLEAN MODE) THEN 30 END DESC, id DESC LIMIT " + lim + " OFFSET " + offset + "" + rows2, err := db.Query(sqlQuery) + if err != nil { + res.Page = strconv.Itoa(0) + res.Query = m["q"][0] //get original unsafe query + if json { + w.Header().Set("Content-Type", "application/json") + t, _ := template.ParseFiles("coreassets/json/results.json.go") + t.Execute(w, res) + } else { + t, _ := template.ParseFiles("coreassets/results.html.go") + t.Execute(w, res) + } + //p := indexPage{} + //t, _ := template.ParseFiles("coreassets/form.html.go") + //t.Execute(w, p) + return + } + + for rows2.Next() { + count++ + //this will get set if position of longest word of query is found within body + pos := -1 + + err := rows2.Scan(&id, &url, &title, &description, &body) + if err != nil { + error.Error = err.Error() + t, _ := template.ParseFiles("coreassets/error.html.go") + t.Execute(w, error) + } + + //find query inside body of page + if exactMatch == false && (numRequiredWords == 0 || numRequiredWords + wordcount == numRequiredWords){ + //remove the '*' if contained anywhere in query + //if strings.Contains(queryNoQuotes,"*"){ + // queryNoQuotes = strings.Replace(queryNoQuotes, "*", "", -1) + //} + if len(requiredword) > 0 { //search for position of required word if any, else search for position of whole query + pos = strings.Index(strings.ToLower(body), strings.ToLower(requiredword)) + } else if pos == -1 { + pos = strings.Index(strings.ToLower(body), 
strings.ToLower(queryNoQuotesOrFlags)) + } + if pos == -1 { //Not found? prepare to find position of longest query word within body + //remove the '*' at the end of the longest word if present + if strings.Contains(longestWord, "*") { + longestWord = strings.Replace(longestWord, "*", "", -1) + } + //search within body for position of longest query word. + pos = strings.Index(strings.ToLower(body), strings.ToLower(longestWord)) + //not found?, set position to a different word, make sure there's no wildcard on it + if pos == -1 && wordcount > 1 { + if longestwordelementnum > 0 { + words[0] = strings.Replace(words[0], "*", "", -1) + pos = strings.Index(strings.ToLower(body), strings.ToLower(words[0])) + } + if longestwordelementnum == 0 { + words[1] = strings.Replace(words[1], "*", "", -1) + pos = strings.Index(strings.ToLower(body), strings.ToLower(words[1])) + } + } + } + + } else { //if exact match, find position of query within body + pos = strings.Index(strings.ToLower(body), strings.ToLower(queryNoQuotesOrFlags)) + } + //still not found?, set position to 0 + if pos == -1 { + pos = 0 + } + + //Adjust position for runes within body + pos = utf8.RuneCountInString(body[:pos]) + + starttext := 0 + //ballpark := 0 + ballparktext := "" + + //figure out how much preceding text to use + if pos < 32 { + starttext = 0 + } else if pos > 25 { + starttext = pos - 25 + } else if pos > 20 { + starttext = pos - 15 + } + + //total length of the ballpark + textlength := 180 + + //populate the ballpark + if pos >= 0 { + ballparktext = substr(body, starttext, starttext+textlength) + } //else{ ballpark = 0}//looks unused + + //find position of nearest Period + //foundPeriod := true + posPeriod := strings.Index(ballparktext, ". 
") + starttext + 1 + + //find position of nearest Space + //foundSpace := true + posSpace := strings.Index(ballparktext, " ") + starttext + + //if longest word in query is after a period+space within ballpark, reset starttext to that point + if (pos - starttext) > posPeriod { + starttext = posPeriod + //populate the bodymatch + if (pos - starttext) >= 0 { + body = substr(body, starttext, starttext+textlength) + } else { + body = "" + } + } else if pos > posSpace { //else if longest word in query is after a space within ballpark, reset starttext to that point + //else if(pos-starttext) > posSpace//else if longest word in query is after a space within ballpark, reset starttext to that point + starttext = posSpace + //populate the bodymatch + if (pos - starttext) >= 0 { + body = substr(body, starttext, starttext+textlength) + } else { + body = "" + } + } else //else just set the bodymatch to the ballparktext + { + //populate the bodymatch + if (pos - starttext) >= 0 { + body = ballparktext + } else { + body = "" + } + } + + tRes.Id = id + tRes.Url = url + tRes.Title = html.UnescapeString(title) + tRes.Description = html.UnescapeString(description) + tRes.Body = html.UnescapeString(body) + if json == true { + tRes.Title = JSONRealEscapeString(tRes.Title) + tRes.Description = JSONRealEscapeString(tRes.Description) + tRes.Body = JSONRealEscapeString(tRes.Body) + } + res.DBResults = append(res.DBResults, tRes) + } + defer rows2.Close() + rows2.Close() + } + //======================================================================================================================= + //http://go-database-sql.org/retrieving.html +*/ + //Close DB + db.Close() + + //If results = lim, allow the find more link + if count >= limInt && addWildcard == false{ + res.FindMore = true + } else { + res.FindMore = false + } + + if(pageInt == 0){ + pageInt+=2 + }else{ + pageInt++; + } + res.Page = strconv.Itoa(pageInt) + res.Query = m["q"][0] //get original unsafe query + + if json { + 
w.Header().Set("Content-Type", "application/json") + t, _ := template.ParseFiles("coreassets/json/results.json.go") + t.Execute(w, res) + } else { + t, _ := template.ParseFiles("coreassets/results.html.go") + t.Execute(w, res) + } + } +} + +func settings(w http.ResponseWriter, r *http.Request) { + //setup for error report + error := errorReport{} + + //check if worksafe (adult content) cookie enabled. + filterHTTPS := false + worksafe := true + worksafewasoff := false + worksafeHTTPSCookie, err := r.Cookie("ws") + if err != nil { + worksafe = true + filterHTTPS = false + } else if worksafeHTTPSCookie.Value == "0" { + worksafe = false + filterHTTPS = false + worksafewasoff = true + } else if worksafeHTTPSCookie.Value == "1" { + worksafe = true + filterHTTPS = false + } else if worksafeHTTPSCookie.Value == "2" { + worksafe = false + filterHTTPS = true + worksafewasoff = true + } else if worksafeHTTPSCookie.Value == "3" { + worksafe = true + filterHTTPS = true + } + + //check if and what is the user posting + switch r.Method { + case "POST": + if err := r.ParseForm(); err != nil { + error.Error = err.Error() + t, _ := template.ParseFiles("coreassets/error.html.go") + t.Execute(w, error) + } + worksafebox := r.Form.Get("worksafe") + agreecheck := r.Form.Get("agree") + agreesubmit := r.Form.Get("agreesubmit") + httpsbox := r.Form.Get("filterHTTPS") + + //if user agrees to terms to disable adult content, set cookie and return to index + if agreecheck == "on" { + worksafe = false + //expiration := time.Now().Add(365 * 24 * time.Hour) + if filterHTTPS == false { + cookie := http.Cookie{Name: "ws", Value: "0", Path: "/"} + http.SetCookie(w, &cookie) + } else { + cookie := http.Cookie{Name: "ws", Value: "2", Path: "/"} + http.SetCookie(w, &cookie) + } + p := indexPage{} + t, _ := template.ParseFiles("coreassets/settings/gohome.html") + t.Execute(w, p) + //else if worksafebox is checked, return to index with worksafe on + } else if worksafebox == "on" || agreesubmit == "on" { 
+ //expiration := time.Now().Add(365 * 24 * time.Hour) + if httpsbox != "on" { + cookie := http.Cookie{Name: "ws", Value: "1", Path: "/"} + http.SetCookie(w, &cookie) + } else { + cookie := http.Cookie{Name: "ws", Value: "3", Path: "/"} + http.SetCookie(w, &cookie) + } + p := indexPage{} + t, _ := template.ParseFiles("coreassets/settings/gohome.html") + t.Execute(w, p) + //else if worksafebox unchecked and no cookie, go to content agreement section + } else if worksafebox != "on" && worksafewasoff == false && agreesubmit != "on" { + p := indexPage{} + if httpsbox == "on" { + cookie := http.Cookie{Name: "ws", Value: "3", Path: "/"} + http.SetCookie(w, &cookie) + } else { + cookie := http.Cookie{Name: "ws", Value: "1", Path: "/"} + http.SetCookie(w, &cookie) + } + t, _ := template.ParseFiles("coreassets/settings/agree.html.go") + t.Execute(w, p) + //else if worksafebox unchecked and cookie alredy agreed, go back to index + } else if worksafebox != "on" && worksafewasoff == true { + if httpsbox == "on" { + cookie := http.Cookie{Name: "ws", Value: "2", Path: "/"} + http.SetCookie(w, &cookie) + } else { + cookie := http.Cookie{Name: "ws", Value: "0", Path: "/"} + http.SetCookie(w, &cookie) + } + p := indexPage{} + t, _ := template.ParseFiles("coreassets/settings/gohome.html") + t.Execute(w, p) + } + default: + //load the settings page if no post value + settingspage := settingsPage{} + settingspage.Worksafe = worksafe + settingspage.FilterHTTPS = filterHTTPS + t, _ := template.ParseFiles("coreassets/settings/settings.html.go") + t.Execute(w, settingspage) + } +} + +func surprise(w http.ResponseWriter, r *http.Request) { + surprise := surpriseURL{} + + //check if worksafe+HTTPS cookie enabled. 
// mysqlEscaper applies every MySQL string-literal escape in one left-to-right
// pass. A single pass matters: escaping rule by rule (as the previous
// map-driven loop did, in random map order) lets the backslash rule re-escape
// backslashes that another rule has just inserted.
var mysqlEscaper = strings.NewReplacer(
	"\\", "\\\\",
	"'", "\\'",
	"\"", "\\\"",
	"\x00", "\\0", // NUL byte; the old table keyed on the two characters `\0` instead
	"\n", "\\n",
	"\r", "\\r",
	"\x1a", "\\Z",
)

// MysqlRealEscapeString returns value with backslash, single quote, double
// quote, NUL, newline, carriage return and Ctrl-Z (0x1a) replaced by their
// backslash escape sequences. The result is deterministic and each input
// character is escaped exactly once.
func MysqlRealEscapeString(value string) string {
	return mysqlEscaper.Replace(value)
}

// JSONRealEscapeString returns value with backslash, tab, backspace, newline,
// carriage return and form feed replaced by their two-character backslash
// escapes, and every other control character (below 32, or 127) replaced by a
// space. Double quotes are intentionally left untouched, as before.
//
// The string is processed rune by rune in a single pass so that a backslash
// produced by one escape is never escaped again.
func JSONRealEscapeString(value string) string {
	var b strings.Builder
	b.Grow(len(value))
	for _, r := range value {
		switch r {
		case '\\':
			b.WriteString(`\\`)
		case '\t':
			b.WriteString(`\t`)
		case '\b':
			b.WriteString(`\b`)
		case '\n':
			b.WriteString(`\n`)
		case '\r':
			b.WriteString(`\r`)
		case '\f':
			b.WriteString(`\f`)
		default:
			if r < 32 || r == 127 {
				//remove remaining control characters
				b.WriteByte(' ')
			} else {
				b.WriteRune(r)
			}
		}
	}
	return b.String()
}

// substr returns the runes of s in the half-open range [start, end), counting
// runes rather than bytes. A negative start is treated as 0, an end past the
// last rune is clamped to the end of s, and the result is "" when the range is
// empty or start lies at or beyond the end of s.
func substr(s string, start int, end int) string {
	if start < 0 {
		start = 0
	}
	if end <= start {
		return ""
	}
	startByte := -1 //byte offset of rune number start, -1 until reached
	runeIdx := 0
	for byteIdx := range s {
		if runeIdx == start {
			startByte = byteIdx
		}
		if runeIdx == end {
			//end > start, so startByte has already been set
			return s[startByte:byteIdx]
		}
		runeIdx++
	}
	if startByte < 0 {
		//s has no rune at position start
		return ""
	}
	return s[startByte:]
}

// checkformat normalises hyphenated words in a space-separated boolean-mode
// query. Queries containing neither '-' nor '+' are returned unchanged.
// Otherwise, word by word:
//
//   - a word that contains '-' or '+' but does not start with either (so it is
//     not a flag) and is not inside quotes has its hyphens replaced by spaces;
//   - a required word ("+" prefix) containing a hyphen is cut at the first
//     hyphen, and dropped entirely when what remains ("+" included) is shorter
//     than four characters;
//   - a "+" prefix is stripped from words shorter than four characters.
//
// Words are re-joined with single spaces; a dropped word leaves its separator
// behind.
func checkformat(query string) string {
	if !strings.Contains(query, "-") && !strings.Contains(query, "+") {
		return query
	}

	words := strings.Split(query, " ")
	//NOTE(review): quotes counts words that contain a quote, not quote
	//characters, so a single word like "a-b" (both quotes in one word) leaves
	//the parity odd for the rest of the query. Kept as-is; the visible callers
	//only pass quote-free text when this matters.
	quotes := 0
	for i, word := range words {
		if strings.Contains(word, "\"") {
			quotes++
		}
		isFlag := strings.HasPrefix(word, "-") || strings.HasPrefix(word, "+")
		hasHyphen := strings.Contains(word, "-")

		switch {
		case (hasHyphen || strings.Contains(word, "+")) && !isFlag && quotes%2 == 0:
			//plain hyphenated word outside quotes
			word = strings.Replace(word, "-", " ", -1)
		case hasHyphen && strings.HasPrefix(word, "+"):
			//required hyphenated word: keep only the part before the first
			//hyphen; the word starts with '+', so cut is always >= 1
			cut := strings.Index(word, "-")
			word = word[:cut]
			if cut < 4 {
				word = ""
			}
		}

		if len(word) > 1 && word[0] == '+' && len(word) < 4 {
			word = word[1:]
		}
		words[i] = word
	}
	return strings.Join(words, " ")
}
// searchRedirectTargets maps a bang code (the text after '!' or '&') to the
// URL prefix of the external search it stands for. The escaped search terms
// are appended directly to the prefix.
var searchRedirectTargets = map[string]string{
	"g":  "http://google.com/search?q=",              //google text search
	"b":  "http://bing.com/search?q=",                //bing text search
	"gi": "http://www.google.com/search?tbm=isch&q=", //google image search
	"bi": "http://www.bing.com/images/search?q=",     //bing image search
	"gv": "http://www.google.com/search?tbm=vid&q=",  //google video search
	"bv": "http://www.bing.com/videos/search?q=",     //bing video search
	"gm": "http://www.google.com/maps/search/",       //google maps search
	"bm": "http://www.bing.com/maps?q=",              //bing maps search
}

// searchredirect looks for a bang code in query, either as its last word
// ("cats !g", "cats &g") or as its first word ("!g cats"), or as the whole
// query ("!g"). If the code is one of the keys of searchRedirectTargets it
// answers with a 303 redirect to that search engine, passing along the rest
// of the query. In every other case it writes nothing.
//
// The code is matched case-insensitively, and the search terms are
// percent-encoded before being placed in the URL so that characters such as
// '&', '#', '+' or spaces cannot truncate or alter the forwarded query.
func searchredirect(w http.ResponseWriter, r *http.Request, query string) {
	if query == "" {
		return
	}

	//separate actual query from search redirect
	actualquery := ""
	redirect := ""
	startsWithBang := query[0] == '!' || query[0] == '&'
	if firstSpace := strings.Index(query, " "); firstSpace > -1 {
		location := strings.Index(query, " !")
		if location == -1 {
			location = strings.Index(query, " &")
		}
		if location > -1 && !strings.Contains(query[location+1:], " ") { //redirect is at end of query
			redirect = query[location+2:]
			actualquery = query[:location]
		} else if startsWithBang { //redirect is at start of query
			redirect = query[1:firstSpace]
			actualquery = query[firstSpace+1:]
		}
	} else if startsWithBang && len(query) > 1 {
		redirect = query[1:]
	}

	//determine which search engine to redirect to; unknown or empty codes
	//are not redirected
	target, ok := searchRedirectTargets[strings.ToLower(redirect)]
	if !ok {
		return
	}
	//template.URLQueryEscaper comes from html/template, which this file
	//already uses for page rendering; its output is also safe as the path
	//segment needed by the google maps URL.
	http.Redirect(w, r, target+template.URLQueryEscaper(actualquery), http.StatusSeeOther)
}
index 0000000..8e265b7 --- /dev/null +++ b/go/core/core.go @@ -0,0 +1,1324 @@ +package main + +import ( + "database/sql" + "fmt" + _ "github.com/go-sql-driver/mysql" + "html" + "html/template" + "io/ioutil" + "log" + "net/http" + "net/url" + "strconv" + "strings" + "unicode/utf8" + // "sync" + // "time" +) + +type indexPage struct{} +type errorReport struct{ Error string } +type surpriseURL struct{ Url string } +type settingsPage struct{ Worksafe, FilterHTTPS bool } +type MySQLResults struct{ Id, Url, Title, Description, Body string } +type PageData struct { + DBResults []MySQLResults + Query, Page string + FindMore bool +} + +func main() { + http.HandleFunc("/", handler) + http.HandleFunc("/json", handler) + http.HandleFunc("/json/", handler) + http.HandleFunc("/surprise", surprise) + http.HandleFunc("/surprise/", surprise) + http.HandleFunc("/settings/", settings) + http.HandleFunc("/settings", settings) + log.Fatal(http.ListenAndServe("0.0.0.0:8080", nil)) //set IP to localhost if reverse proxy is on the same machine +} + +//https://golang.org/pkg/net/http/#Request +func handler(w http.ResponseWriter, r *http.Request) { + //fmt.Fprintf(w, "%s %s \n", r.Method, r.URL) + //fmt.Fprintf(w, "%s \n", r.URL.RawQuery) + + //Indicate whether or not you are using shard tables + shards := true + + //check if worksafe+https cookie enabled. 
+ filterHTTPS := false + worksafe := true + worksafeHTTPSCookie, err := r.Cookie("ws") + if err != nil { + worksafe = true + filterHTTPS = false + } else if worksafeHTTPSCookie.Value == "0" { + worksafe = false + filterHTTPS = false + } else if worksafeHTTPSCookie.Value == "1" { + worksafe = true + filterHTTPS = false + } else if worksafeHTTPSCookie.Value == "2" { + worksafe = false + filterHTTPS = true + } else if worksafeHTTPSCookie.Value == "3" { + worksafe = true + filterHTTPS = true + } + + //setup for error report + error := errorReport{} + + //Get the raw query + m, _ := url.ParseQuery(r.URL.RawQuery) + //Get the query parameters (q and o) + //fmt.Fprintf(w,"%s\n%s\n", m["q"][0], m["o"][0]) + + json := false + if strings.Contains(r.URL.Path, "/json") { + json = true + if _, ok := m["nsfw"]; ok { //check if &nsfw added to json url + worksafe = false + } + } + + query := "" + queryNoQuotes := "" + + offset := "0" + page := "0" + + //Check if query and page params exist + if _, ok := m["q"]; ok { + query = m["q"][0] + query = strings.Replace(query, "'", "''", -1) + query = strings.Replace(query, "- ", " ", -1) + queryNoQuotes = query + } + if _, ok := m["p"]; ok {//gets page num, will convert to offset further down + page = m["p"][0] + page = strings.Replace(page, "'", "''", -1) + offset = page + } + + lim := "12" + // limDistributedInt := + + if query == "" { //what do if no query found? 
+ //load index if no query detected + if r.URL.Path == "/" { + p := indexPage{} + t, _ := template.ParseFiles("coreassets/form.html.go") + t.Execute(w, p) + } else if strings.Contains(r.URL.Path, "/json") { //load json info page if json selected + p := indexPage{} + t, _ := template.ParseFiles("coreassets/json/json.html.go") + t.Execute(w, p) + } else { + p := indexPage{} + t, _ := template.ParseFiles("coreassets/form.html.go") + t.Execute(w, p) + } + } else { + + //Make sure offset is a number + offsetInt, err := strconv.Atoi(offset) + if err != nil { + offset = "0" + offsetInt = 0 + } + + //Make sure page is a number + pageInt, err := strconv.Atoi(page) + if err != nil { + page = "0" + pageInt = 0 + } + + //Convert lim to number + limInt, _ := strconv.Atoi(lim) + + //convert page num to offset + if offsetInt > 0 { + offsetInt --; + } + offsetInt = offsetInt * limInt + offset = strconv.Itoa(offsetInt) + + //get some details from the raw query + var additions string + querylen := len(query) + + //see if a search redirect (! 
or &) is used for a different search engine + if json == false && (strings.Contains(m["q"][0],"!") || strings.Contains(m["q"][0],"&")){ + searchredirect(w, r, m["q"][0]) + } + + //phone users + if query[querylen-1] == ' '{ + query = query[:querylen-1] + queryNoQuotes = queryNoQuotes[:len(queryNoQuotes)-1] + querylen = len(query) + } + if querylen > 1 && query[0] == ' '{ + query = query[1:querylen] + queryNoQuotes = queryNoQuotes[1:len(queryNoQuotes)] + querylen = len(query) + } + + //check if user wants to limit search to a specific website + sitePos := -1 + siteEnd := 0 + siteURL := "" + if strings.Index(strings.ToLower(query), "site:") > -1 { + //get url user wants to search and remove it from the query string + sitePos = strings.Index(strings.ToLower(query), "site:") + siteEnd = strings.Index(query[sitePos:], " ") + //fmt.Printf("\n%d\n%d\n",sitePos,siteEnd) + if siteEnd > -1 && sitePos > 1 { //site is not last part of query + siteURL = query[sitePos+5 : siteEnd+sitePos] + query = query[:sitePos-1] + query[siteEnd+sitePos:] + queryNoQuotes = queryNoQuotes[:sitePos-1] + queryNoQuotes[siteEnd+sitePos:] + additions = additions + "AND url LIKE '%" + siteURL + "%' " + } else if siteEnd > -1 && sitePos == 0 { //site is at beginning + siteURL = query[sitePos+5 : siteEnd] + query = query[siteEnd+1:] + queryNoQuotes = queryNoQuotes[siteEnd+1:] + additions = additions + "AND url LIKE '%" + siteURL + "%' " + } else if siteEnd < 0 && sitePos > 1 { //site is at end + siteURL = query[sitePos+5:] + query = query[:sitePos-1] + queryNoQuotes = queryNoQuotes[:sitePos-1] + additions = additions + "AND url LIKE '%" + siteURL + "%' " + }else if querylen > 5{ + query = query[5:] + } + querylen = len(query) + } + //fmt.Printf("Addition: \n%s\nQuery: '%s'\n",additions,query) + + //see if user uses -https flag (instead of cookie settings option) + if querylen > 7 && strings.ToLower(query[querylen-7:querylen]) == " -https" { + filterHTTPS = true + query = query[0 : querylen-7] + querylen 
= len(query) + } + + //check if user wants to search within a time window (day,week,month) + option := "" + //fmt.Printf("\n'%s'\n",query) + location := strings.Index(query, " !") + if location == -1 { + location = strings.Index(query, " &") + } + if location > -1 && strings.Index(query[location+1:querylen], " ") == -1 { //option is at end of query + option = query[location+2 : querylen] + query = query[:location] + queryNoQuotes = queryNoQuotes[:location] + querylen = len(query) + }else if querylen > 0 && (query[0] == '!' || query[0] == '&') && strings.Index(query, " ") > -1{ //option is at start of query + option = query[1:strings.Index(query, " ")] + query = query[strings.Index(query, " ")+1:] + queryNoQuotes = queryNoQuotes[strings.Index(queryNoQuotes, " ")+1:] + querylen = len(query) + } + option = strings.ToLower(option) + if option != "" { + if option == "td" { //day + additions = additions + "AND date > NOW() - INTERVAL 1 DAY " + } else if option == "tw" { //week + additions = additions + "AND date > NOW() - INTERVAL 7 DAY " + } else if option == "tm" { //month + additions = additions + "AND date > NOW() - INTERVAL 30 DAY " + } else if option == "ty" { //year + additions = additions + "AND date > NOW() - INTERVAL 365 DAY " + } + } + + //check if worksafe and filterHTTPS flags set + if worksafe == true { + additions = additions + "AND worksafe = '1' " + } + if filterHTTPS == true { + additions = additions + "AND http = '1' " + } + + //search if query has quotes and remove them (so we can find the longest word in the query) + exactMatch := false + //queryNoQuotes := query + if strings.Contains(query, "\"") { + exactMatch = true + queryNoQuotes = strings.Replace(queryNoQuotes, "\"", "", -1) + //fmt.Printf("%s \n", queryNoQuotes) + } + + //remove the '*' if contained anywhere in queryNoQuotes + if strings.Contains(queryNoQuotes, "*") && exactMatch == false { + queryNoQuotes = strings.Replace(queryNoQuotes, "*", "", -1) + } + + //Prepare to find longest word in 
query + words := strings.Split(queryNoQuotes, " ") + longestWordLength := 0 + longestWord := "" + wordcount := 0 + longestwordelementnum := 0 + queryNoQuotesOrFlags := queryNoQuotes + requiredword := "" + flags := "" + flagssetbyuser := 0 + wordlen := 0 + numRequiredWords := 0 + //queryNoFlags := "" + + //first remove any flags inside var queryNoQuotes, also grab any required words (+ prefix) + if strings.Contains(queryNoQuotes, "-") || strings.Contains(queryNoQuotes, "+") { + queryNoQuotesOrFlags = "" + for i, wordNoFlags := range words { + if i > 0 && strings.HasPrefix(wordNoFlags, "-") == false && strings.HasPrefix(wordNoFlags, "+") == false { //add a space after + queryNoQuotesOrFlags += " " + } + if strings.HasPrefix(wordNoFlags, "-") == false && strings.HasPrefix(wordNoFlags, "+") == false { + queryNoQuotesOrFlags += wordNoFlags + } + if strings.HasPrefix(wordNoFlags, "+") == true && len(wordNoFlags) > 1 && requiredword == "" { //get requiredword + requiredword = wordNoFlags[1:len(wordNoFlags)] + } + if i > 0 && strings.HasPrefix(wordNoFlags, "-") == true || strings.HasPrefix(wordNoFlags, "+") == true { + flags += " " + wordNoFlags + flagssetbyuser++ + if strings.HasPrefix(wordNoFlags, "+") == true { + numRequiredWords++ + } + } + } + flags = checkformat(flags) + } + //now find longest word, and build extra locate statements for partial matches (when sorting results returned from replicas) + partialLocate := "" + locateWords := false + words = strings.Split(queryNoQuotesOrFlags, " ") + for _, word := range words { + if len(word) > longestWordLength { + longestWordLength = len(word) + longestWord = word + longestwordelementnum = wordcount + } + if wordcount < 5 && len(word) > 3{ + if locateWords == false { + partialLocate += " WHEN LOCATE('" + word + "', title) " + }else{ + partialLocate += "OR LOCATE('" + word + "', title) " + } + locateWords=true + } + if word != ""{ + wordcount++ + } + } + if locateWords == true{ + partialLocate += "THEN 10" + } + + 
//fmt.Printf("\n%s",partialLocate) + + //create another query where all compatible words are marked as required + reqwordQuery := "" + for i, word := range words{ + wordlen = len(word) + if i==0 && (strings.HasPrefix(word, "+") == true || strings.HasPrefix(word, "-") == true) && wordlen > 3{ + reqwordQuery += word + } + if i==0 && (strings.HasPrefix(word, "+") == false && strings.HasPrefix(word, "-") == false) { + if wordlen > 2 { + reqwordQuery += "+" + } + reqwordQuery += word + } + if i!=0 && (strings.HasPrefix(word, "+") == true || strings.HasPrefix(word, "-") == true) && wordlen > 3{ + reqwordQuery += " " + reqwordQuery += word + } + if i!=0 && (strings.HasPrefix(word, "+") == false && strings.HasPrefix(word, "-") == false) { + reqwordQuery += " " + if wordlen > 2 { + reqwordQuery += "+" + } + reqwordQuery += word + } + } + reqwordQuery = checkformat(reqwordQuery) + reqwordQuery += flags + + //fmt.Fprintf(w,"%s\n%s\n", query,offset) + //fmt.Printf("hai\n") + + //get copy of original query because we might have to modify it somewhat + queryOriginal := query + + tRes := MySQLResults{} + var res = PageData{} + + //Check if query is a url. 
+ urlDetected := false + isURL := "" + isURLlocate := "" + if strings.Index(query, " ") == -1 && strings.Index(query, "\"") == -1 && strings.Index(query, ".") > -1 { //note this will also flag on file extensions + if len(query) > 6 && (query[0:7] == "http://" || query[0:7] == "HTTP://") { + query = query[7:] + } else if len(query) > 7 && (query[0:8] == "https://" || query[0:8] == "HTTPS://") { + query = query[8:] + } + if len(queryNoQuotes) > 6 && (queryNoQuotes[0:7] == "http://" || queryNoQuotes[0:7] == "HTTP://") { + queryNoQuotes = queryNoQuotes[7:] + } else if len(queryNoQuotes) > 7 && (queryNoQuotes[0:8] == "https://" || queryNoQuotes[0:8] == "HTTPS://") { + queryNoQuotes = queryNoQuotes[8:] + } + query = "\"" + query + "\"" + urlDetected = true + isURL = "WHEN MATCH(url) AGAINST('\"" + queryNoQuotes + "\"' IN BOOLEAN MODE) THEN 25" + isURLlocate = "WHEN LOCATE('" + queryNoQuotesOrFlags + "', url) THEN 25" + } + + //if no required words set, make the longest word in the query required. + querywithrequiredword := "" + if numRequiredWords == 0 && wordcount > 1 && longestWordLength > 2{ + querywithrequiredword = query + " +" + querywithrequiredword = querywithrequiredword + longestWord + } + + //perform full text search FOR InnoDB or MyISAM + var sqlQuery, id, url, title, description, body, idList string + rangeOffset := 0 + serverCount := 0 + var servers []string + numServers := 0 + //parse res.csv + noservers := false + repLim, _ := strconv.Atoi(lim) + repOffset, _ := strconv.Atoi(offset) + repLimStr := "" + repOffsetStr := "" + shard := "" + noresults := 0 + repsearchfail := 0 + var idListChans []chan string + + oneword := false + if strings.Index(query, " ") == -1{ + oneword = true + } + + resourceFile, err := ioutil.ReadFile("res.csv") + if err != nil { + noservers = true + } else { + if len(resourceFile) < 2 { + noservers = true + } + } + + //this switches off use of multiple connections to process a one word query. 
Should remove this if the database grows significantly larger + /*if strings.Contains(query, " ") == false && oneletterquery == 0 { + noservers = true + }*/ + + queryWithQuotesAndFlags := "\"" + queryNoQuotesOrFlags + "\"" + flags + queryWithQuotes := "\"" + queryNoQuotesOrFlags + "\"" + + //if query is just 1 or 2 letters, help make it work. + if utf8.RuneCountInString(queryOriginal) < 3 { + queryfix := "" + query + "*" + query = queryfix + queryWithQuotesAndFlags = queryfix + reqwordQuery = queryfix + } + if strings.Contains(queryOriginal,"c++")==true || strings.Contains(queryOriginal,"C++")==true{ // :) :( :) :( + exactMatch=true + queryWithQuotesAndFlags += " +programming" + if strings.Contains(queryOriginal," ")==true && longestWordLength>3{ + queryWithQuotesAndFlags += " +" + queryWithQuotesAndFlags += longestWord + } + } + + querytouse := query + if querywithrequiredword != ""{ + querytouse = querywithrequiredword + }else if numRequiredWords > 0{ + querytouse = reqwordQuery + } + + if exactMatch == false && urlDetected == false { + querytouse = checkformat(querytouse) + reqwordQuery = checkformat(reqwordQuery) + } + + reqwordQuery_filtered := strings.Replace(reqwordQuery, "'", "", -1) + //For a less restrictive search, replace only the first instance of reqwordQuery_filtered with querytouse_filtered in the SQL query used when calling the distributedQuery go routine + querytouse_filtered := strings.Replace(querytouse, "'", "", -1) + queryWithQuotesAndFlags_filtered := strings.Replace(queryWithQuotesAndFlags, "'", "", -1) + queryWithQuotes_filtered := strings.Replace(queryWithQuotes, "'", "", -1) + + if noservers == false { + //send query to go routines. 
+ resourceFilestring := string(resourceFile) + //just in case user is messing around res.csv with a text editor and the editor ads a line feed to the end of the file + if len(resourceFilestring) > 0 && resourceFilestring[len(resourceFilestring)-1] == byte('\n') { + resourceFilestring = resourceFilestring[0 : len(resourceFilestring)-1] + } + servers = strings.Split(resourceFilestring, "\n") + numServers = len(servers) + + if(shards == false){ + //numServers must divide evenly into lim, or lim must divide evenly into numservers + //if they do not, automatically adjust numServers until they divide evenly + + //calculate number of servers to use based on lim size + if limInt > numServers { + for limInt%numServers > 0 { + numServers -= 1 + } + } else if numServers > limInt { + for numServers%limInt > 0 { + numServers -= 1 + } + } + } + + //calculate limit and offset on distributed servers. + if numServers < limInt { + repLim = limInt / numServers + } else { + repLim = 1 + } + repOffset = offsetInt / numServers + + //calculate rangeOffset (offset for the range of returned results, important if numServers > 2*lim) + rangeOffset = offsetInt - (repOffset * numServers) + + repLimStr = strconv.Itoa(repLim) + repOffsetStr = strconv.Itoa(repOffset) + + //create a channel for each available server + for i := 0; i < numServers; i++ { + idListChans = append(idListChans, make(chan string)) + } + + for _, server := range servers { + serverSettings := strings.Split(server, ",") + if len(serverSettings) == 4 { //if line contains all 4 settings + //ip, database, startID, endID + //create SQL connection string //db, err := sql.Open("mysql", "remote_guest:d0gemuchw0w@tcp(192.168.1.xxx:3306)/wiby?charset=utf8mb4") + serverIP := serverSettings[0] + shard = serverSettings[1] + startID := serverSettings[2] + endID := serverSettings[3] + sqlString := "remote_guest:d0gemuchw0w@tcp(" + serverIP + ":3306)/wiby?charset=utf8mb4" + // fmt.Printf("%s %s %s %d\n",sqlString,startID,endID,numServers) + 
//send special distributed query, only need ID returned + if(shards==false){//depricated + /*if(exactMatch==false && urlDetected==false && oneword==false){ + sqlQuery = "SELECT id FROM windex WHERE id BETWEEN " + startID + " AND " + endID + " AND enable = '1' " + additions + "ORDER BY CASE WHEN MATCH(tags) AGAINST('" + queryWithQuotesAndFlags + "' IN BOOLEAN MODE) THEN 30 " + isURL + " WHEN MATCH(title) AGAINST('" + queryWithQuotesAndFlags + "' IN BOOLEAN MODE) AND Match(title) AGAINST('" + query + "' IN BOOLEAN MODE) THEN 20 WHEN MATCH(body) AGAINST('" + queryWithQuotesAndFlags + "' IN BOOLEAN MODE) THEN 19 WHEN MATCH(title) AGAINST('" + queryWithQuotesAndFlags + "' IN BOOLEAN MODE) THEN 16 WHEN MATCH(description) AGAINST('" + queryWithQuotesAndFlags + "' IN BOOLEAN MODE) THEN 15 WHEN Match(title) AGAINST('" + query + "' IN BOOLEAN MODE) THEN Match(title) AGAINST('" + query + "' IN BOOLEAN MODE) WHEN MATCH(body) AGAINST('" + query + "' IN BOOLEAN MODE) THEN 1 WHEN MATCH(url) AGAINST('" + query + "' IN BOOLEAN MODE) THEN 0 END DESC, id DESC LIMIT " + repLimStr + " OFFSET " + repOffsetStr + "" + }else{ + sqlQuery = "SELECT id FROM windex WHERE id BETWEEN " + startID + " AND " + endID + " AND enable = '1' " + additions + "ORDER BY CASE WHEN MATCH(tags) AGAINST('" + queryWithQuotesAndFlags + "' IN BOOLEAN MODE) THEN 30 " + isURL + " WHEN MATCH(title) AGAINST('" + queryWithQuotesAndFlags + "' IN BOOLEAN MODE) THEN 20 WHEN MATCH(body) AGAINST('" + queryWithQuotesAndFlags + "' IN BOOLEAN MODE) THEN 19 WHEN MATCH(description) AGAINST('" + queryWithQuotesAndFlags + "' IN BOOLEAN MODE) THEN 15 WHEN MATCH(url) AGAINST('" + query + "' IN BOOLEAN MODE) THEN 0 END DESC, id DESC LIMIT " + repLimStr + " OFFSET " + repOffsetStr + "" + }*/ + }else{ + if (exactMatch==false || flagssetbyuser > 0) && urlDetected==false && strings.Index(query, " ") != -1 && flagssetbyuser + wordcount != flagssetbyuser{ + sqlQuery = "SELECT id FROM " + shard + " WHERE MATCH(tags, body, description, 
title, url) AGAINST('" + reqwordQuery_filtered + "' IN BOOLEAN MODE) AND enable = '1' " + additions + "ORDER BY CASE WHEN MATCH(tags) AGAINST('" + queryWithQuotes_filtered + "' IN BOOLEAN MODE) THEN 30 " + isURL + " WHEN MATCH(title) AGAINST('" + queryWithQuotes_filtered + "' IN BOOLEAN MODE) THEN 20 WHEN MATCH(body) AGAINST('" + queryWithQuotes_filtered + "' IN BOOLEAN MODE) OR MATCH(description) AGAINST('" + queryWithQuotes_filtered + "' IN BOOLEAN MODE) THEN 15 WHEN MATCH(title) AGAINST('" + reqwordQuery_filtered + "' IN BOOLEAN MODE) THEN 14 WHEN MATCH(title) AGAINST('" + querytouse_filtered + "' IN BOOLEAN MODE) THEN 13 END DESC, id DESC LIMIT " + repLimStr + " OFFSET " + repOffsetStr + "" + }else{ + sqlQuery = "SELECT id FROM " + shard + " WHERE MATCH(tags, body, description, title, url) AGAINST('" + queryWithQuotesAndFlags_filtered + "' IN BOOLEAN MODE) AND enable = '1' " + additions + "ORDER BY CASE WHEN MATCH(tags) AGAINST('" + queryWithQuotesAndFlags_filtered + "' IN BOOLEAN MODE) THEN 30 " + isURL + " WHEN MATCH(title) AGAINST('" + queryWithQuotesAndFlags_filtered + "' IN BOOLEAN MODE) THEN 20 END DESC, id DESC LIMIT " + repLimStr + " OFFSET " + repOffsetStr + "" + } + } + go distributedQuery(sqlString, sqlQuery, startID, endID, idListChans[serverCount]) + serverCount++ + } + } + for i := 0; i < serverCount; i++ { + //wait for channels to complete and collect results + idList += <-idListChans[i] + } + if len(idList) > 0 { + switch strings.Contains(idList, "e") { + case true: + repsearchfail = 1 + default: + idList = idList[1:len(idList)] //trim the first comma in the list + } + } else { + noresults = 1 + } + //fmt.Printf("\nChan: %s",idList) + } + + //init the db and set charset + + //create SQL connection string + db, err := sql.Open("mysql", "guest:qwer@/wiby?charset=utf8mb4") + if err != nil { + p := indexPage{} + t, _ := template.ParseFiles("coreassets/error.html.go") + t.Execute(w, p) + } + defer db.Close() + // If Open doesn't open a connection. 
Validate DSN data: + err = db.Ping() + if err != nil { + error.Error = err.Error() + t, _ := template.ParseFiles("coreassets/error.html.go") + t.Execute(w, error) + } + count := 0 + countResults := 0 + var ids[] string + + //if all went well with replication servers, send query to master containing idList and use the rangeOffset + if numServers == serverCount && numServers > 0 && repsearchfail == 0 { + sqlQuery = "SELECT id, url, title, description, body FROM windex WHERE id IN (" + idList + ") AND enable = '1' " + additions + "ORDER BY CASE WHEN LOCATE('" + queryNoQuotesOrFlags + "', tags) THEN 30 " + isURLlocate + " WHEN LOCATE('" + queryNoQuotesOrFlags + "', title) THEN 20 WHEN LOCATE('" + queryNoQuotesOrFlags + "', body) OR LOCATE('" + queryNoQuotesOrFlags + "', description) THEN 15" + partialLocate + " END DESC, id DESC LIMIT " + lim + " OFFSET " + strconv.Itoa(rangeOffset) + "" + } else { //else, if no replication servers or there was some sort of error, just search the database locally instead + if(exactMatch==false && urlDetected==false && oneword==false && flagssetbyuser + wordcount != flagssetbyuser){ + sqlQuery = "SELECT id, url, title, description, body FROM windex WHERE MATCH(tags, body, description, title, url) AGAINST('" + querytouse + "' IN BOOLEAN MODE) AND enable = '1' " + additions + "ORDER BY CASE WHEN MATCH(tags) AGAINST('" + queryWithQuotes + "' IN BOOLEAN MODE) THEN 30 " + isURL + " WHEN MATCH(title) AGAINST('" + queryWithQuotes + "' IN BOOLEAN MODE) THEN 20 WHEN MATCH(body) AGAINST('" + queryWithQuotes + "' IN BOOLEAN MODE) OR MATCH(description) AGAINST('" + queryWithQuotes + "' IN BOOLEAN MODE) THEN 15 WHEN MATCH(title) AGAINST('" + reqwordQuery + "' IN BOOLEAN MODE) THEN 14 WHEN MATCH(title) AGAINST('" + querytouse + "' IN BOOLEAN MODE) THEN 13 END DESC, id DESC LIMIT " + lim + " OFFSET " + offset + "" + }else{ + if(shards==false){//depricated + /*sqlQuery = "SELECT id, url, title, description, body FROM windex WHERE enable = '1' " + 
additions + "ORDER BY CASE WHEN MATCH(tags) AGAINST('" + queryWithQuotesAndFlags + "' IN BOOLEAN MODE) THEN 30 " + isURL + " WHEN MATCH(title) AGAINST('" + queryWithQuotesAndFlags + "' IN BOOLEAN MODE) THEN 20 WHEN MATCH(body) AGAINST('" + queryWithQuotesAndFlags + "' IN BOOLEAN MODE) THEN 19 WHEN MATCH(description) AGAINST('" + queryWithQuotesAndFlags + "' IN BOOLEAN MODE) THEN 15 WHEN MATCH(url) AGAINST('" + query + "' IN BOOLEAN MODE) THEN 0 END DESC, id DESC LIMIT " + lim + " OFFSET " + offset + ""*/ + }else{ + sqlQuery = "SELECT id, url, title, description, body FROM windex WHERE MATCH(tags, body, description, title, url) AGAINST('" + queryWithQuotesAndFlags + "' IN BOOLEAN MODE) AND enable = '1' " + additions + "ORDER BY CASE WHEN MATCH(tags) AGAINST('" + queryWithQuotesAndFlags + "' IN BOOLEAN MODE) THEN 30 " + isURL + " WHEN MATCH(title) AGAINST('" + queryWithQuotesAndFlags + "' IN BOOLEAN MODE) THEN 20 END DESC, id DESC LIMIT " + lim + " OFFSET " + offset + "" + } + } + } + //fmt.Printf("\n%s",sqlQuery) + switch noresults { //if noresults == 1, no results were found during search on active replication servers + case 0: + // Send the query + rows, err := db.Query(sqlQuery) + if err != nil { + fmt.Printf("\n%s", err) + res.Page = strconv.Itoa(0) + res.Query = m["q"][0] //get original unsafe query + if json { + w.Header().Set("Content-Type", "application/json") + t, _ := template.ParseFiles("coreassets/json/results.json.go") + t.Execute(w, res) + } else { + t, _ := template.ParseFiles("coreassets/results.html.go") + t.Execute(w, res) + } + //p := indexPage{} + //t, _ := template.ParseFiles("coreassets/form.html.go") + //t.Execute(w, p) + return + } + + if urlDetected == true { + query = queryOriginal + } + + wordtocheck := "" + stringtofind := strings.ToLower(queryNoQuotesOrFlags) + stringtofind = strings.Replace(stringtofind, "''", "'", -1) + requiredwordtofind := strings.ToLower(requiredword) + requiredwordtofind = strings.Replace(requiredwordtofind, "''", 
"'", -1) + longestWordtofind := strings.ToLower(longestWord) + longestWordtofind = strings.Replace(longestWordtofind, "''", "'", -1) + + for rows.Next() { + count++ + countResults++ + + //this will get set if position of longest word of query is found within body + pos := -1 + + err := rows.Scan(&id, &url, &title, &description, &body) + if err != nil { + error.Error = err.Error() + t, _ := template.ParseFiles("coreassets/error.html.go") + t.Execute(w, error) + } + + ids = append(ids,id) + + //find query inside body of page + if exactMatch == false && (numRequiredWords == 0 || numRequiredWords + wordcount == numRequiredWords){ + if len(requiredword) > 0 { //search for position of required word if any, else search for position of whole query + pos = strings.Index(strings.ToLower(body), requiredwordtofind) + } else if pos == -1 { + pos = strings.Index(strings.ToLower(body), stringtofind) + } + + if pos == -1 { //not found? find position of longest query word + pos = strings.Index(strings.ToLower(body), longestWordtofind) + //not found?, set position to a different word + if pos == -1 && wordcount > 1 { + if longestwordelementnum > 0 { + //wordtocheck = strings.Replace(words[0], "*", "", -1) + wordtocheck = strings.Replace(words[0], "''", "'", -1) + pos = strings.Index(strings.ToLower(body), strings.ToLower(wordtocheck)) + } + if longestwordelementnum == 0 { + //wordtocheck = strings.Replace(words[1], "*", "", -1) + wordtocheck = strings.Replace(words[1], "''", "'", -1) + pos = strings.Index(strings.ToLower(body), strings.ToLower(wordtocheck)) + } + } + } + } else { //if exact match, find position of query within body + pos = strings.Index(strings.ToLower(body), stringtofind) + } + + //still not found?, set position to 0 + if pos == -1 { + pos = 0 + } + + //Adjust position for runes within body + pos = utf8.RuneCountInString(body[:pos]) + + starttext := 0 + //ballpark := 0 + ballparktext := "" + + //figure out how much preceding text to use + if pos < 32 { + starttext 
= 0 + } else if pos > 25 { + starttext = pos - 25 + } else if pos > 20 { + starttext = pos - 15 + } + + //total length of the ballpark + textlength := 180 + + //populate the ballpark + if pos >= 0 { + ballparktext = substr(body, starttext, starttext+textlength) + } //else{ ballpark = 0}//looks unused + + //find position of nearest Period + //foundPeriod := true + posPeriod := strings.Index(ballparktext, ". ") + starttext + 1 + + //find position of nearest Space + //foundSpace := true + posSpace := strings.Index(ballparktext, " ") + starttext + + //if longest word in query is after a period+space within ballpark, reset starttext to that point + if (pos - starttext) > posPeriod { + starttext = posPeriod + //populate the bodymatch + if (pos - starttext) >= 0 { + body = substr(body, starttext, starttext+textlength) + } else { + body = "" + } + } else if pos > posSpace { //else if longest word in query is after a space within ballpark, reset starttext to that point + //else if(pos-starttext) > posSpace//else if longest word in query is after a space within ballpark, reset starttext to that point + starttext = posSpace + //populate the bodymatch + if (pos - starttext) >= 0 { + body = substr(body, starttext, starttext+textlength) + } else { + body = "" + } + } else //else just set the bodymatch to the ballparktext + { + //populate the bodymatch + if (pos - starttext) >= 0 { + body = ballparktext + } else { + body = "" + } + } + + tRes.Id = id + tRes.Url = url + tRes.Title = html.UnescapeString(title) + tRes.Description = html.UnescapeString(description) + tRes.Body = html.UnescapeString(body) + if json == true { + tRes.Title = JSONRealEscapeString(tRes.Title) + tRes.Description = JSONRealEscapeString(tRes.Description) + tRes.Body = JSONRealEscapeString(tRes.Body) + } + res.DBResults = append(res.DBResults, tRes) + } + defer rows.Close() + rows.Close() + if count > 0 { //new search method may cause less than the limit of row results per page even if there are more results 
to come, so we force a full count + count = limInt + } + } //end switch + + //================================================================================================================================ + //no results found (count==0), so do a wildcard search (repeat the above process) - this section will probably be removed, no longer useful + addWildcard := false + /*if count == 0 && offset == "0" && urlDetected == false && exactMatch == false { + + addWildcard = true + query = strings.Replace(query, "\"", "", -1) //remove some things innodb gets fussy over + query = strings.Replace(query, "*", "", -1) + query = strings.Replace(query, "'", "", -1) + queryNoQuotes = strings.Replace(queryNoQuotes, "\"", "", -1) + queryNoQuotes = strings.Replace(queryNoQuotes, "*", "", -1) + queryNoQuotes = strings.Replace(queryNoQuotes, "'", "", -1) + query = query + "*" + + if shards == false{ + sqlQuery = "SELECT id, url, title, description, body FROM windex WHERE enable = '1' " + additions + "ORDER BY CASE WHEN MATCH(tags) AGAINST('" + query + "' IN BOOLEAN MODE) THEN 30 END DESC, id DESC LIMIT " + lim + " OFFSET " + offset + "" + }else{ + sqlQuery = "SELECT id, url, title, description, body FROM windex WHERE Match(tags, body, description, title, url) Against('" + query + "' IN BOOLEAN MODE) AND enable = '1' " + additions + "ORDER BY CASE WHEN MATCH(tags) AGAINST('" + query + "' IN BOOLEAN MODE) THEN 30 END DESC, id DESC LIMIT " + lim + " OFFSET " + offset + "" + } + + if repsearchfail == 0 && noservers == false { + serverCount = 0 + idList = "" + for _, server := range servers { + serverSettings := strings.Split(server, ",") + if len(serverSettings) == 4 { //if line contains all 4 settings + //ip, database, startID, endID + //create SQL connection string //db, err := sql.Open("mysql", "remote_guest:d0gemuchw0w@tcp(10.8.0.102:3306)/wiby?charset=utf8mb4") + serverIP := serverSettings[0] + shard := serverSettings[1] + startID := serverSettings[2] + endID := serverSettings[3] + 
sqlString := "remote_guest:d0gemuchw0w@tcp(" + serverIP + ":3306)/wiby?charset=utf8mb4" + //fmt.Printf("%s %s %s %d\n",sqlString,startID,endID,numServers) + + //send special distributed query, only need ID returned + if(shards==false){//depricated + sqlQuery = "SELECT id FROM windex WHERE id BETWEEN " + startID + " AND " + endID + " AND enable = '1' " + additions + " ORDER BY CASE WHEN MATCH(tags) AGAINST('" + query + "' IN BOOLEAN MODE) THEN 30 END DESC, id DESC LIMIT " + repLimStr + " OFFSET " + repOffsetStr + "" + }else{ + sqlQuery = "SELECT id FROM " + shard + " WHERE Match(tags, body, description, title, url) Against('" + query + "' IN BOOLEAN MODE) AND enable = '1' " + additions + "ORDER BY CASE WHEN MATCH(tags) AGAINST('" + query + "' IN BOOLEAN MODE) THEN 30 END DESC, id DESC LIMIT " + repLimStr + " OFFSET " + repOffsetStr + "" + } + go distributedQuery(sqlString, sqlQuery, startID, endID, idListChans[serverCount]) + serverCount++ + } + } + for i := 0; i < serverCount; i++ { + //wait for channels to complete and collect results + idList += <-idListChans[i] + } + if len(idList) > 0 { + switch strings.Contains(idList, "e") { + case true: + repsearchfail = 1 + default: + idList = idList[1:len(idList)] //trim the first comma in the list + } + } else { + noresults = 1 + } + //if all went well with replication servers, send query to local database containing idList and use the rangeOffset + if numServers == serverCount && numServers > 0 && repsearchfail == 0 { + sqlQuery = "SELECT id, url, title, description, body FROM windex WHERE id IN (" + idList + ") AND enable = '1' " + additions + "ORDER BY CASE WHEN LOCATE('" + queryNoQuotes + "', tags) THEN 30 END DESC, id DESC LIMIT " + lim + " OFFSET " + strconv.Itoa(rangeOffset) + "" + } else { //else, if no replication servers or there was some sort of error, search the whole local database instead + if shards == false{ + sqlQuery = "SELECT id, url, title, description, body FROM windex WHERE enable = '1' " + additions 
+ "ORDER BY CASE WHEN MATCH(tags) AGAINST('" + query + "' IN BOOLEAN MODE) THEN 30 END DESC, id DESC LIMIT " + lim + " OFFSET " + offset + "" + }else{ + sqlQuery = "SELECT id, url, title, description, body FROM windex WHERE Match(tags, body, description, title, url) Against('" + query + "' IN BOOLEAN MODE) AND enable = '1' " + additions + "ORDER BY CASE WHEN MATCH(tags) AGAINST('" + query + "' IN BOOLEAN MODE) THEN 30 END DESC, id DESC LIMIT " + lim + " OFFSET " + offset + "" + } + } + } + + rows2, err := db.Query(sqlQuery) + if err != nil { + res.Page = strconv.Itoa(0) + res.Query = m["q"][0] //get original unsafe query + if json { + w.Header().Set("Content-Type", "application/json") + t, _ := template.ParseFiles("coreassets/json/results.json.go") + t.Execute(w, res) + } else { + t, _ := template.ParseFiles("coreassets/results.html.go") + t.Execute(w, res) + } + + return + } + + wordtocheck := "" + stringtofind := strings.ToLower(queryNoQuotesOrFlags) + stringtofind = strings.Replace(stringtofind, "''", "'", -1) + requiredwordtofind := strings.ToLower(requiredword) + requiredwordtofind = strings.Replace(requiredwordtofind, "''", "'", -1) + longestWordtofind := strings.ToLower(longestWord) + longestWordtofind = strings.Replace(longestWordtofind, "''", "'", -1) + + for rows2.Next() { + count++ + //this will get set if position of longest word of query is found within body + pos := -1 + + err := rows2.Scan(&id, &url, &title, &description, &body) + if err != nil { + error.Error = err.Error() + t, _ := template.ParseFiles("coreassets/error.html.go") + t.Execute(w, error) + } + + //find query inside body of page + if exactMatch == false && (numRequiredWords == 0 || numRequiredWords + wordcount == numRequiredWords){ + if len(requiredword) > 0 { //search for position of required word if any, else search for position of whole query + pos = strings.Index(strings.ToLower(body), requiredwordtofind) + } else if pos == -1 { + pos = strings.Index(strings.ToLower(body), 
stringtofind) + } + + if pos == -1 { //not found? find position of longest query word + pos = strings.Index(strings.ToLower(body), longestWordtofind) + //not found?, set position to a different word + if pos == -1 && wordcount > 1 { + if longestwordelementnum > 0 { + //wordtocheck = strings.Replace(words[0], "*", "", -1) + wordtocheck = strings.Replace(words[0], "''", "'", -1) + pos = strings.Index(strings.ToLower(body), strings.ToLower(wordtocheck)) + } + if longestwordelementnum == 0 { + //wordtocheck = strings.Replace(words[1], "*", "", -1) + wordtocheck = strings.Replace(words[1], "''", "'", -1) + pos = strings.Index(strings.ToLower(body), strings.ToLower(wordtocheck)) + } + } + } + } else { //if exact match, find position of query within body + pos = strings.Index(strings.ToLower(body), stringtofind) + } + + //still not found?, set position to 0 + if pos == -1 { + pos = 0 + } + + //Adjust position for runes within body + pos = utf8.RuneCountInString(body[:pos]) + + starttext := 0 + //ballpark := 0 + ballparktext := "" + + //figure out how much preceding text to use + if pos < 32 { + starttext = 0 + } else if pos > 25 { + starttext = pos - 25 + } else if pos > 20 { + starttext = pos - 15 + } + + //total length of the ballpark + textlength := 180 + + //populate the ballpark + if pos >= 0 { + ballparktext = substr(body, starttext, starttext+textlength) + } //else{ ballpark = 0}//looks unused + + //find position of nearest Period + //foundPeriod := true + posPeriod := strings.Index(ballparktext, ". 
") + starttext + 1 + + //find position of nearest Space + //foundSpace := true + posSpace := strings.Index(ballparktext, " ") + starttext + + //if longest word in query is after a period+space within ballpark, reset starttext to that point + if (pos - starttext) > posPeriod { + starttext = posPeriod + //populate the bodymatch + if (pos - starttext) >= 0 { + body = substr(body, starttext, starttext+textlength) + } else { + body = "" + } + } else if pos > posSpace { //else if longest word in query is after a space within ballpark, reset starttext to that point + //else if(pos-starttext) > posSpace//else if longest word in query is after a space within ballpark, reset starttext to that point + starttext = posSpace + //populate the bodymatch + if (pos - starttext) >= 0 { + body = substr(body, starttext, starttext+textlength) + } else { + body = "" + } + } else //else just set the bodymatch to the ballparktext + { + //populate the bodymatch + if (pos - starttext) >= 0 { + body = ballparktext + } else { + body = "" + } + } + + tRes.Id = id + tRes.Url = url + tRes.Title = html.UnescapeString(title) + tRes.Description = html.UnescapeString(description) + tRes.Body = html.UnescapeString(body) + if json == true { + tRes.Title = JSONRealEscapeString(tRes.Title) + tRes.Description = JSONRealEscapeString(tRes.Description) + tRes.Body = JSONRealEscapeString(tRes.Body) + } + res.DBResults = append(res.DBResults, tRes) + } + defer rows2.Close() + rows2.Close() + }*/ + //======================================================================================================================= + //http://go-database-sql.org/retrieving.html + + //Close DB + db.Close() + + //allow the find more link + if (countResults >= limInt || countResults > 2) && addWildcard == false{ + res.FindMore = true + } else { + res.FindMore = false + } + + if(pageInt == 0){ + pageInt+=2 + }else{ + pageInt++; + } + + res.Page = strconv.Itoa(pageInt) + res.Query = m["q"][0] //get original unsafe query + + if 
json { + w.Header().Set("Content-Type", "application/json") + t, _ := template.ParseFiles("coreassets/json/results.json.go") + t.Execute(w, res) + } else { + t, _ := template.ParseFiles("coreassets/results.html.go") + t.Execute(w, res) + } + } +} + +func settings(w http.ResponseWriter, r *http.Request) { + //setup for error report + error := errorReport{} + + //check if worksafe (adult content) cookie enabled. + filterHTTPS := false + worksafe := true + worksafewasoff := false + worksafeHTTPSCookie, err := r.Cookie("ws") + if err != nil { + worksafe = true + filterHTTPS = false + } else if worksafeHTTPSCookie.Value == "0" { + worksafe = false + filterHTTPS = false + worksafewasoff = true + } else if worksafeHTTPSCookie.Value == "1" { + worksafe = true + filterHTTPS = false + } else if worksafeHTTPSCookie.Value == "2" { + worksafe = false + filterHTTPS = true + worksafewasoff = true + } else if worksafeHTTPSCookie.Value == "3" { + worksafe = true + filterHTTPS = true + } + + //check if and what is the user posting + switch r.Method { + case "POST": + if err := r.ParseForm(); err != nil { + error.Error = err.Error() + t, _ := template.ParseFiles("coreassets/error.html.go") + t.Execute(w, error) + } + worksafebox := r.Form.Get("worksafe") + agreecheck := r.Form.Get("agree") + agreesubmit := r.Form.Get("agreesubmit") + httpsbox := r.Form.Get("filterHTTPS") + + //if user agrees to terms to disable adult content, set cookie and return to index + if agreecheck == "on" { + worksafe = false + //expiration := time.Now().Add(365 * 24 * time.Hour) + if filterHTTPS == false { + cookie := http.Cookie{Name: "ws", Value: "0", Path: "/"} + http.SetCookie(w, &cookie) + } else { + cookie := http.Cookie{Name: "ws", Value: "2", Path: "/"} + http.SetCookie(w, &cookie) + } + p := indexPage{} + t, _ := template.ParseFiles("coreassets/settings/gohome.html") + t.Execute(w, p) + //else if worksafebox is checked, return to index with worksafe on + } else if worksafebox == "on" || agreesubmit 
== "on" { + //expiration := time.Now().Add(365 * 24 * time.Hour) + if httpsbox != "on" { + cookie := http.Cookie{Name: "ws", Value: "1", Path: "/"} + http.SetCookie(w, &cookie) + } else { + cookie := http.Cookie{Name: "ws", Value: "3", Path: "/"} + http.SetCookie(w, &cookie) + } + p := indexPage{} + t, _ := template.ParseFiles("coreassets/settings/gohome.html") + t.Execute(w, p) + //else if worksafebox unchecked and no cookie, go to content agreement section + } else if worksafebox != "on" && worksafewasoff == false && agreesubmit != "on" { + p := indexPage{} + if httpsbox == "on" { + cookie := http.Cookie{Name: "ws", Value: "3", Path: "/"} + http.SetCookie(w, &cookie) + } else { + cookie := http.Cookie{Name: "ws", Value: "1", Path: "/"} + http.SetCookie(w, &cookie) + } + t, _ := template.ParseFiles("coreassets/settings/agree.html.go") + t.Execute(w, p) + //else if worksafebox unchecked and cookie alredy agreed, go back to index + } else if worksafebox != "on" && worksafewasoff == true { + if httpsbox == "on" { + cookie := http.Cookie{Name: "ws", Value: "2", Path: "/"} + http.SetCookie(w, &cookie) + } else { + cookie := http.Cookie{Name: "ws", Value: "0", Path: "/"} + http.SetCookie(w, &cookie) + } + p := indexPage{} + t, _ := template.ParseFiles("coreassets/settings/gohome.html") + t.Execute(w, p) + } + default: + //load the settings page if no post value + settingspage := settingsPage{} + settingspage.Worksafe = worksafe + settingspage.FilterHTTPS = filterHTTPS + t, _ := template.ParseFiles("coreassets/settings/settings.html.go") + t.Execute(w, settingspage) + } +} + +func surprise(w http.ResponseWriter, r *http.Request) { + surprise := surpriseURL{} + + //check if worksafe+HTTPS cookie enabled. 
// mysqlEscapeReplacer escapes, in a single left-to-right pass, the characters
// that are special inside a MySQL string literal.
var mysqlEscapeReplacer = strings.NewReplacer(
	"\\", "\\\\",
	"'", `\'`,
	"\x00", "\\0",
	"\n", "\\n",
	"\r", "\\r",
	`"`, `\"`,
	"\x1a", "\\Z",
)

// jsonEscapeReplacer escapes backslashes and the whitespace control
// characters that have a short JSON escape. Double quotes are deliberately
// left alone, as in the original implementation.
var jsonEscapeReplacer = strings.NewReplacer(
	"\\", "\\\\",
	"\t", "\\t",
	"\b", "\\b",
	"\n", "\\n",
	"\r", "\\r",
	"\f", "\\f",
)

// MysqlRealEscapeString returns value with backslash, single quote, double
// quote, NUL, newline, carriage return and Ctrl-Z escaped for use inside a
// MySQL string literal.
//
// All replacements happen in one pass. The previous implementation ranged
// over a map, whose iteration order is random in Go, so a backslash inserted
// by one rule could be escaped again by the backslash rule (turning \' into
// \\'), which leaves the quote unescaped.
func MysqlRealEscapeString(value string) string {
	return mysqlEscapeReplacer.Replace(value)
}

// JSONRealEscapeString returns value prepared for embedding in a JSON string:
// backslash, tab, backspace, newline, carriage return and form feed are
// replaced by their two-character escapes, and every remaining control
// character (below 32, or 127) is replaced by a space.
//
// Replacement is done in a single pass so the result does not depend on map
// iteration order.
func JSONRealEscapeString(value string) string {
	escaped := jsonEscapeReplacer.Replace(value)

	//remove control characters
	return strings.Map(func(r rune) rune {
		if r < 32 || r == 127 {
			return ' '
		}
		return r
	}, escaped)
}

// substr returns the runes of s in the half-open range [start, end), counting
// positions in runes rather than bytes.
//
// A negative start is treated as 0 and an end beyond the string is clamped to
// its length. An empty string is returned when the range is empty (end <=
// start) or when start lies at or past the end of s; the previous
// implementation returned the whole string (or a prefix) in those cases.
func substr(s string, start int, end int) string {
	if start < 0 {
		start = 0
	}
	if end <= start {
		return ""
	}
	startByte := -1
	runeIdx := 0
	for byteIdx := range s {
		if runeIdx == start {
			startByte = byteIdx
		}
		if runeIdx == end {
			// end > start, so startByte has been set by now.
			return s[startByte:byteIdx]
		}
		runeIdx++
	}
	if startByte == -1 {
		// start is at or beyond the number of runes in s.
		return ""
	}
	return s[startByte:]
}

// checkformat normalises hyphens and plus signs in a search query.
//
// The query is returned unchanged unless it contains "-" or "+". Otherwise it
// is split on single spaces and each word is handled as follows:
//   - a word containing "-" or "+" that does not start with either and is not
//     inside double quotes has its hyphens replaced by spaces;
//   - a required word ("+" prefix) containing a hyphen is cut at the first
//     hyphen, and dropped entirely when what remains is shorter than four
//     bytes including the "+";
//   - a remaining "+" prefix is removed from words of two or three bytes.
//
// Quote state is tracked by counting quote characters, so a word that both
// opens and closes a quoted phrase (for example "a") no longer causes every
// later word to be treated as quoted.
func checkformat(query string) string {
	if !strings.Contains(query, "-") && !strings.Contains(query, "+") {
		return query
	}

	words := strings.Split(query, " ")
	quotes := 0
	for i, word := range words {
		quoteCount := strings.Count(word, "\"")
		quotes += quoteCount
		// The word is quoted when a phrase is still open after it, or when it
		// carries both the opening and the closing quote itself.
		quoted := quotes%2 == 1 || quoteCount > 1

		hasHyphen := strings.Contains(word, "-")
		isFlag := len(word) > 0 && (word[0] == '-' || word[0] == '+')

		if (hasHyphen || strings.Contains(word, "+")) && !isFlag && !quoted {
			word = strings.Replace(word, "-", " ", -1)
		} else if hasHyphen && word[0] == '+' { //hyphen exists and is a required word
			word = strings.Replace(word, "-", " ", -1)
			spos := strings.Index(word, " ")
			if spos != -1 {
				word = word[:spos]
			}
			if spos < 4 && spos > 0 {
				word = ""
			}
		}
		if len(word) > 1 && len(word) < 4 && word[0] == '+' {
			word = word[1:]
		}
		words[i] = word
	}
	return strings.Join(words, " ")
}
// searchRedirectTargets maps a redirect shortcut (the text following "!" or
// "&" in a query) to the URL prefix the remaining query is appended to.
var searchRedirectTargets = map[string]string{
	"g":  "http://google.com/search?q=",              //google text search
	"b":  "http://bing.com/search?q=",                //bing text search
	"gi": "http://www.google.com/search?tbm=isch&q=", //google image search
	"bi": "http://www.bing.com/images/search?q=",     //bing image search
	"gv": "http://www.google.com/search?tbm=vid&q=",  //google video search
	"bv": "http://www.bing.com/videos/search?q=",     //bing video search
	"gm": "http://www.google.com/maps/search/",       //google maps search
	"bm": "http://www.bing.com/maps?q=",              //bing maps search
}

// searchredirect sends the client to an external search engine when query
// carries a redirect shortcut, and does nothing otherwise.
//
// The shortcut is introduced by "!" or "&" and may stand at the end of the
// query ("cats !gi"), at the start ("!gi cats"), or alone ("!g"). Shortcuts
// found next to a query are matched case-insensitively. An empty query is
// ignored; previously it caused an index-out-of-range panic.
//
// NOTE(review): the query text is appended to the target URL as-is. If the
// caller does not already URL-encode it, it should be escaped before use.
func searchredirect(w http.ResponseWriter, r *http.Request, query string) {
	if query == "" {
		return
	}

	//separate actual query from search redirect
	actualquery := ""
	redirect := ""
	startsWithMarker := query[0] == '!' || query[0] == '&'

	if firstSpace := strings.Index(query, " "); firstSpace > -1 {
		location := strings.Index(query, " !")
		if location == -1 {
			location = strings.Index(query, " &")
		}
		if location > -1 && !strings.Contains(query[location+1:], " ") { //redirect is at end of query
			redirect = query[location+2:]
			actualquery = query[:location]
		} else if startsWithMarker { //redirect is at start of query
			redirect = query[1:firstSpace]
			actualquery = query[firstSpace+1:]
		}
		redirect = strings.ToLower(redirect)
	} else if startsWithMarker && len(query) > 1 {
		redirect = query[1:]
	}

	//determine which search engine to redirect
	if prefix, ok := searchRedirectTargets[redirect]; ok {
		http.Redirect(w, r, prefix+actualquery, http.StatusSeeOther)
	}
}

// distributedQuery runs sqlQuery against the MySQL server described by the
// DSN con and sends the outcome on idListChan.
//
// On success the value sent is every returned id prefixed with a comma
// (",12,57,..."), or the empty string when there are no rows. On any failure
// (opening the handle, running the query, scanning a row, or iterating) the
// value contains the marker "e", which the receiver looks for to detect a
// failed server. Exactly one value is always sent, so a receiver waiting on
// the channel cannot be left blocked.
//
// startID and endID are not used by this function; they are kept so existing
// callers continue to compile.
func distributedQuery(con string, sqlQuery string, startID string, endID string, idListChan chan<- string) {
	idListChan <- collectDistributedIDs(con, sqlQuery)
}

// collectDistributedIDs performs the database work for distributedQuery and
// returns the comma-prefixed id list, or "e" on failure.
func collectDistributedIDs(con string, sqlQuery string) string {
	//init the db
	db, err := sql.Open("mysql", con)
	if err != nil {
		// db is nil here; continuing would dereference it.
		fmt.Printf("%s", err)
		return "e"
	}
	defer db.Close()

	// Open doesn't open a connection; Query establishes it and reports any
	// connection failure, so no separate Ping round trip is made.
	rows, err := db.Query(sqlQuery)
	if err != nil {
		fmt.Printf("%s", err)
		return "e"
	}
	defer rows.Close()

	var idList strings.Builder
	var id string
	for rows.Next() {
		if err := rows.Scan(&id); err != nil {
			fmt.Printf("%s", err)
			return "e"
		}
		idList.WriteString(",")
		idList.WriteString(id)
	}
	if err := rows.Err(); err != nil {
		fmt.Printf("%s", err)
		return "e"
	}
	return idList.String()
}

Wiby kaputnik :( ...

+ + + + diff --git a/go/core/coreassets/form.html.go b/go/core/coreassets/form.html.go new file mode 100755 index 0000000..e26c4cc --- /dev/null +++ b/go/core/coreassets/form.html.go @@ -0,0 +1,46 @@ + + + + Oohay! - Search Engine for Compu-Global-Hyper-Mega-Net + + + + + + + + + +

Oohay!


+
+
+ + +

+ +
+
+
+
+                                        __
+                 ,-_                  (`  ).
+                 |-_'-,              (     ).
+                 |-_'-'           _(        '`.
+        _        |-_'/        .=(`(      .     )
+       /;-,_     |-_'        (     (.__.:-`-_.'
+      /-.-;,-,___|'          `(       ) )
+     /;-;-;-;_;_/|\_ _ _ _ _   ` __.:'   )
+        x_( __`|_P_|`-;-;-;,|        `--'
+        |\ \    _||   `-;-;-'
+        | \`   -_|.      '-'
+        | /   /-_| `
+        |/   ,'-_|  \
+        /____|'-_|___\
+ _..,____]__|_\-_'|_[___,.._
+'                          ``'--,..,.      mic
+        
+
+
Privacy | About +
+ + diff --git a/go/core/coreassets/json/json.html.go b/go/core/coreassets/json/json.html.go new file mode 100755 index 0000000..8424723 --- /dev/null +++ b/go/core/coreassets/json/json.html.go @@ -0,0 +1,19 @@ + + + + JSON API + + + + +

Using JSON API

+

Use https://domain.com/json/ to get a JSON output of search results.

+ Example: https://domain.com/json/?q=test outputs results for the query 'test'.

+ Append the parameter &p=NUM to get the next page of results.

+ Example: https://domain.com/json/?q=test&p=2

+ Append the parameter &nsfw to include results that are marked as not safe for work. +

Terms of Use: +
1. Use this service at your own risk. +

+ + diff --git a/go/core/coreassets/json/results.json.go b/go/core/coreassets/json/results.json.go new file mode 100755 index 0000000..7556630 --- /dev/null +++ b/go/core/coreassets/json/results.json.go @@ -0,0 +1,9 @@ +[ +{{range $i, $e:=.DBResults}}{{if $i}}, +{{end}} { + "URL": "{{.Url}}", + "Title": "{{.Title}}", + "Snippet": "{{.Body}}", + "Description": "{{.Description}}" + }{{end}} +] diff --git a/go/core/coreassets/results.html.go b/go/core/coreassets/results.html.go new file mode 100755 index 0000000..5a6a009 --- /dev/null +++ b/go/core/coreassets/results.html.go @@ -0,0 +1,33 @@ + + + + + {{.Query}} + + + + + +
+
+ name   + + +
+

+
+


+ + {{range .DBResults}} +
+ {{ printf "%.150s" .Title}}

{{.Url}}

{{printf "%.180s" .Body}}
{{printf "%.180s" .Description}}

+
+ {{end}} + + {{if .FindMore }} +


Find more...
+ {{else}} +


That's everything I could find.
Help make me smarter by submitting a page.

+ {{end}} + + diff --git a/go/core/coreassets/settings/agree.html.go b/go/core/coreassets/settings/agree.html.go new file mode 100755 index 0000000..9190de0 --- /dev/null +++ b/go/core/coreassets/settings/agree.html.go @@ -0,0 +1,33 @@ + + + + + + + Adult Content Agreement + + + + + + + + +

Adult Content Agreement

+ + You have indicated that you do not want adult content filtered.
+ By clicking agree, you accept that you will not freak out over what could end up displayed in the search results.
+ We try to ensure content that is illegal does not get stored into the index. + If you are 18 years of age or older and agree to the terms, check the box and press Submit. +

+

+
+
I agree to the terms and conditions (check and submit)
+

+ Return to search + +
+ + + + diff --git a/go/core/coreassets/settings/gohome.html b/go/core/coreassets/settings/gohome.html new file mode 100755 index 0000000..033b9c6 --- /dev/null +++ b/go/core/coreassets/settings/gohome.html @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/go/core/coreassets/settings/settings.html.go b/go/core/coreassets/settings/settings.html.go new file mode 100755 index 0000000..f309527 --- /dev/null +++ b/go/core/coreassets/settings/settings.html.go @@ -0,0 +1,75 @@ + + + + + Settings + + + + + + + +
+
+ + {{ if .Worksafe }} + + {{else}} + + {{end}}  

+ + {{ if .FilterHTTPS }} + + {{else}} + + {{end}} +

*for old browsers


+
+ +

+ About +
+




Search Options:

+ "This is fun" (use quotes to find an exact match)
+ cats +tabby (finds 'cats', but 'tabby' must be found within)
+ cats -tabby (results must not contain the word tabby)
+ +cats +dogs (both must be found within the document)
+
+ !td tornado (find within the frame of one day)
+ !tw tornado (find within the frame of one week)
+ !tm tornado (find within the frame of one month)
+ !ty tornado (find within the frame of one year)
+
+ site:URL Lorem ipsum (limit search within a domain or URL)
+
+
+

Redirect Options:
+
+ !g Paris (Google Text Search)
+ !gi Paris (Google Images)
+ !gv Paris (Google Videos)
+ !gm Paris (Google Maps)
+
+ !b Paris (Bing Text Search)
+ !bi Paris (Bing Images)
+ !bv Paris (Bing Videos)
+ !bm Paris (Bing Maps)
+
+ You may also use '&' in place of '!'. +

+
+ + + + + + + diff --git a/go/core/coreassets/surprise.html.go b/go/core/coreassets/surprise.html.go new file mode 100755 index 0000000..73460c1 --- /dev/null +++ b/go/core/coreassets/surprise.html.go @@ -0,0 +1,10 @@ + + + + + + + + You asked for it! + + diff --git a/go/core/go.mod b/go/core/go.mod new file mode 100644 index 0000000..ab0edb3 --- /dev/null +++ b/go/core/go.mod @@ -0,0 +1,8 @@ +module mysql + +go 1.18 + +require ( + filippo.io/edwards25519 v1.1.0 // indirect + github.com/go-sql-driver/mysql v1.8.1 // indirect +) diff --git a/go/core/go.sum b/go/core/go.sum new file mode 100644 index 0000000..19dbcec --- /dev/null +++ b/go/core/go.sum @@ -0,0 +1,4 @@ +filippo.io/edwards25519 v1.1.0 h1:FNf4tywRC1HmFuKW5xopWpigGjJKiJSV0Cqo0cJWDaA= +filippo.io/edwards25519 v1.1.0/go.mod h1:BxyFTGdWcka3PhytdK4V28tE5sGfRvvvRV7EaN4VDT4= +github.com/go-sql-driver/mysql v1.8.1 h1:LedoTUt/eveggdHS9qUFC1EFSa8bU2+1pZjSRpvNJ1Y= +github.com/go-sql-driver/mysql v1.8.1/go.mod h1:wEBSXgmK//2ZFJyE+qWnIsVGmvmEKlqwuVSjsCm7DZg= diff --git a/html/about/button.gif b/html/about/button.gif new file mode 100755 index 0000000..87a6a75 Binary files /dev/null and b/html/about/button.gif differ diff --git a/html/about/guide.html b/html/about/guide.html new file mode 100755 index 0000000..88ecf84 --- /dev/null +++ b/html/about/guide.html @@ -0,0 +1,679 @@ + + + +Build your own Search Engine + + + + + +
+

Build Your Own Search Engine

+
(Wiby Install Guide)
+
+

+Overview +
+Installation +
+Controlling +
+Scaling + +

Overview

+Wiby is a search engine for the World Wide Web. The source code is now free as of July 8, 2022 under the GPLv2 license. I have been longing for this day! You can watch a quick demo here. +
+
+It includes a web interface allowing guardians to control where, how far, and how often it crawls websites and follows hyperlinks. The search index is stored inside of an InnoDB full-text index. +
+
+Fast queries are maintained by concurrently searching different sections of the index across multiple replication servers or across duplicate server connections, returning a list of top results from each connection, +then searching the combined list to ensure correct ordering. Replicas that fail are automatically excluded; new replicas are easy to include. +As new pages are crawled, they are stored randomly across the index, ensuring each search section can obtain relevant results.
+
+The search engine is not meant to index the entire web and then sort it with a ranking algorithm. +It prefers to seed its index through human submissions made by guests, or by the guardian(s) of the search engine. +
+
+The software is designed for anyone with some extra computers (even a Pi), to host their own search engine catering to whatever niche matters to them. The search engine includes a simple API +for meta search engines to harness. +
+
+I hope this will enable anyone with a love of computers to cheaply build and maintain a search engine of their own. +I hope it can cultivate free and independent search engines, ensuring accessibility of ideas and information across the World Wide Web. +
+
+
+
+       Web Traffic
+            |
+            |
++-----------+-----------+
+| Reverse Proxy (nginx) |
++-----------+-----------+
+            |
+            |
++-----------+-----------+
+|  Wiby Core Server(s)  |+-----------------+----------------------------+
+|(Golang or PHP version)|                  |                            |
++-----------+-----------+       +----------+----------+       +---------+---------+
+            |                   |Replication Databases|+-----+|Replication Tracker|
+            |                   +----------+----------+       +-------------------+
++-----------+-----------+                  |
+|    Primary Database   |+-----------------+
+|   (MySQL or MariaDB)  |
++----+-------------+----+
+     |             |  
+     |             |  
++----+-----+  +----+----+
+|   Web    |  | Refresh |
+|Crawler(s)|  |Scheduler|
++----------+  +---------+
+
+
+
+

Installation

+I can only provide manual install instructions at this time. +
+
+Note that while the software is functionally complete, it is still in beta. Anticipate that some bugs will be discovered now that the source is released. +Ensure that you isolate the search engine from your other important services, and if you are running parts of it out of your home, keep the servers +on a separate VLAN. Make sure this VLAN cannot access your router or switch interface. Continue this practise even when the software reaches "1.0". +
+
+If you have created a "LAMP", or rather a "LEMP" server before, this isn't much more complicated. If you've never done that, I suggest you find a "LEMP" tutorial. +

+

Build a LEMP server

+Digital Ocean tutorials are usually pretty good so here is a link to one for Ubuntu 20 and +Ubuntu 22. +
+
+For the sake of simplicity, assume all instructions are for Ubuntu 20 or 22. If you are on a different distro, modify the install steps accordingly to suit your distro. +
+
+If you don't have a physical server, you can rent computing space by looking for a "VPS provider". This virtual computer will be your reverse proxy, and if you want, it can host everything else too. +
+
+

Install the following additional packages:

+
apt install build-essential php-gd libcurl4-openssl-dev libmysqlclient-dev golang git
+
+

Get Wiby Source Files

+Download the source directly from Wiby here, or from GitHub. The source is released under the GPLv2 license. Copy the source files for Wiby to your server. +
+
+ +

Compile the crawler (cr), refresh scheduler (rs), replication tracker (rt):

+
+gcc cr.c -o cr -lmysqlclient -lcurl -std=c99 -O3
+gcc rs.c -o rs -lmysqlclient -std=c99 -O3
+gcc rt.c -o rt -lmysqlclient -std=c99 -O3
+
+If you get any compile errors, it is likely due to the path of the mysql or libcurl header files. +This could happen if you are not using Ubuntu. You might have to locate the correct path for curl.h, easy.h, mysql.h, then edit the #include paths in the source files. +
+
+

Build the core server application:

+The core application is located inside the go folder. Run the following commands after copying the files over to your preferred location: +
+For Ubuntu 20:
+go get -u github.com/go-sql-driver/mysql
+
+For Ubuntu 22 OR latest Golang versions:
+go install github.com/go-sql-driver/mysql@latest
+go mod init mysql
+go get github.com/go-sql-driver/mysql
+
+go build core.go
+go build 1core.go
+
+If you are just starting out, you can use '1core'. If you are going to setup replication servers or you are using a computer with a lot of available cores, you can use 'core', but make sure to read the scaling section. +
+
+If you want to use 1core on a server separate from your reverse proxy server, modify line 37 of 1core.go: replace 'localhost' with '0.0.0.0' so that it accepts connections over your VPN from your reverse proxy. +
+
+You can also use index.php in the root of the www directory and not use the Go version at all. Though the PHP version is used mainly for prototyping. +
+
+

Build the Primary Database:

+Make sure these lines are inside of /etc/mysql/my.cnf, then restart mysql +
+[client]
+default-character-set=utf8mb4
+
+[mysql]
+default-character-set = utf8mb4
+
+[mysqld]
+max_connections = 2000
+ft_min_word_len=1
+sql_mode = "NO_BACKSLASH_ESCAPES"
+character-set-server = utf8mb4
+collation-server = utf8mb4_unicode_520_ci
+innodb_ft_enable_stopword=0
+skip-character-set-client-handshake
+default-authentication-plugin=mysql_native_password
+wait_timeout = 800
+
+#memory use settings, you should adjust this based on your hardware
+innodb_buffer_pool_size = 1342177280
+innodb_buffer_pool_instances = 2
+innodb_flush_method = O_DIRECT
+
+
+Login to MySQL and type: +
+create database wiby;
+create database wibytemp;
+
+Import the wiby and wibytemp database files: +
+mysql -u root -p wiby < wiby.sql
+mysql -u root -p wibytemp < wibytemp.sql
+
+Login to MySQL, create the following accounts and give them the correct access:
+
+create user 'guest'@'localhost' identified by 'qwer';
+create user 'approver'@'localhost' identified by 'foobar';
+create user 'crawler'@'localhost' identified by 'seekout';
+create user 'remote_guest'@'localhost' identified by 'd0gemuchw0w';
+use wiby;
+grant select on accounts to 'approver'@'localhost';
+grant select on reviewqueue to 'approver'@'localhost';
+grant insert on indexqueue to 'approver'@'localhost';
+grant delete on reviewqueue to 'approver'@'localhost';
+grant update on reviewqueue to 'approver'@'localhost';
+grant select on indexqueue to 'crawler'@'localhost';
+grant insert on windex to 'crawler'@'localhost';
+grant insert on indexqueue to 'crawler'@'localhost';
+grant update on windex to 'crawler'@'localhost';
+grant delete on indexqueue to 'crawler'@'localhost';
+grant delete on windex to 'crawler'@'localhost';
+grant select on windex to 'crawler'@'localhost';
+grant insert on reviewqueue to 'crawler'@'localhost';
+grant select on windex to 'guest'@'localhost';
+grant insert on reviewqueue to 'guest'@'localhost';
+grant insert on feedback to 'guest'@'localhost';
+grant select on feedback to 'approver'@'localhost';
+grant delete on feedback to 'approver'@'localhost';
+grant insert on graveyard to 'approver'@'localhost';
+grant update on graveyard to 'approver'@'localhost';
+grant delete on graveyard to 'approver'@'localhost';
+grant select on graveyard to 'approver'@'localhost';
+grant update on accounts to 'approver'@'localhost';
+grant insert on accounts to 'approver'@'localhost';
+grant delete on accounts to 'approver'@'localhost';
+grant select on ws0 to 'crawler'@'localhost';
+grant update on ws0 to 'crawler'@'localhost';
+grant insert on ws0 to 'crawler'@'localhost';
+grant delete on ws0 to 'crawler'@'localhost';
+grant select on ws1 to 'crawler'@'localhost';
+grant update on ws1 to 'crawler'@'localhost';
+grant insert on ws1 to 'crawler'@'localhost';
+grant delete on ws1 to 'crawler'@'localhost';
+grant select on ws2 to 'crawler'@'localhost';
+grant update on ws2 to 'crawler'@'localhost';
+grant insert on ws2 to 'crawler'@'localhost';
+grant delete on ws2 to 'crawler'@'localhost';
+grant select on ws3 to 'crawler'@'localhost';
+grant update on ws3 to 'crawler'@'localhost';
+grant insert on ws3 to 'crawler'@'localhost';
+grant delete on ws3 to 'crawler'@'localhost';
+grant select on windex to 'remote_guest'@'localhost';
+grant select on ws0 to 'remote_guest'@'localhost';
+grant select on ws1 to 'remote_guest'@'localhost';
+grant select on ws2 to 'remote_guest'@'localhost';
+grant select on ws3 to 'remote_guest'@'localhost';
+use wibytemp;
+grant select on titlecheck to 'crawler'@'localhost';
+grant insert on titlecheck to 'crawler'@'localhost';
+grant delete on titlecheck to 'crawler'@'localhost';
+grant select on rejected to 'approver'@'localhost';
+grant insert on rejected to 'approver'@'localhost';
+grant delete on rejected to 'approver'@'localhost';
+grant insert on rejected to 'crawler'@'localhost';
+grant select on reserve_id to 'crawler'@'localhost';
+grant insert on reserve_id to 'crawler'@'localhost';
+grant delete on reserve_id to 'crawler'@'localhost';
+grant select on crawled to 'crawler'@'localhost';
+grant insert on crawled to 'crawler'@'localhost';
+grant delete on crawled to 'crawler'@'localhost';
+FLUSH PRIVILEGES;
+
+

Copy the HTML files and PHP scripts to your web server

+
Copy the contents of the html directory into the nginx html directory (/var/www/html)
+ +

Configure nginx for Wiby

+In /etc/nginx/, create a directory called 'phpcache', and another one called 'cache'. +
+Instead of going through every detail, I will provide a template for you to try out as your default nginx config from inside /etc/nginx/sites-available/ of the source code. +New nginx versions deprecated /sites-available, so you might have to place the template inside /etc/nginx/conf.d instead. +
+
+You should learn nginx configuration on your own, this template is just to assist. +If you are using only the php version, comment all "core app" location entries to revert Wiby search to the php only version. +
+Make sure ssl_certificate and ssl_certificate_key have the path for your SSL files instead of the example paths. If you don't want to use SSL, just remove the server {} configuration for SSL connections (on port 443). +Also the example file references php7.4-fpm.sock, so if you are using a different version remember to update that as well (such as php8.1-fpm.sock on Ubuntu 22). +
+
+

Start the Refresh Scheduler

+This program (rs) will make sure all pages indexed are refreshed at least once per week (or sooner depending on how you assign updates to an individual website). +You may want to run this on startup, easiest way to set that is with a cron job (crontab -e). Run './rs -h' to get more parameters and info needed to run multiple crawlers. +To start manually: 'nohup ./rs &' then press ctrl-c. +
+
+

Start the Crawler

+It is best to run the crawler in a Screen session so that you can monitor its output. You can have more than one crawler running as long as you keep them in separate directories, include symlinks to the same robots folder and 'shards' file, and also set the correct parameters on each. +To view the parameters, type './cr -h'. Without any parameters set, you can only run one crawler (which might be all you need anyway). If necessary, you can change the database connection from 'localhost' to a different IP from inside cr.c, then rebuild. +
+
+If using more than one crawler, update the variable '$num_crawlers' from inside of review.php and graveyard.php (line 73) to the number of crawlers you are using. +
+
+Note that you may need to change the crawler's user-agent (CURLOPT_USERAGENT in cr.c and checkrobots.h) if you have issues indexing some websites. Pages that fail to index are noted inside of abandoned.txt. +
+
+Make sure the robots folder exists, or create one in the same directory as the crawler. All robots.txt files are stored in the robots folder. They are downloaded once and then referenced from that folder on future updates. Clear this folder every few weeks to ensure robots.txt files get refreshed from time to time. You can also create custom robots.txt files for specific domains and store them there for the crawler to reference. +To disable checking for robots.txt files, comment out the line calling the "checkrobots" function inside of cr.c. +
+
+If crawling through hyperlinks on a page, the following file types are accepted: html, htm, xhtml, shtml, txt, php, asp. Links containing parameters are ignored. These limitations do not apply to pages directly submitted by people. +
+
+

Start the Replication Tracker

+The tracker (rt) should run in the same directory that you will run the core server on. You do not need this if running 1core or the PHP only version. You can use a cron job to run it on startup, or +start it manually with this command: 'nohup ./rt &' then press ctrl-c. +
+
+

Start the Core Server

+You can run the core server on startup with a cron job, or start it manually with this command: 'nohup ./core &' then press ctrl-c. +
+
+If you are just starting out, '1core' or the php version is easiest to start with. Use 'core' if you want to scale computer resources as the index grows or if you have at least four available CPU cores. It is recommended you use 'core' as it makes better use of your CPU, but make sure to read the scaling section. +
+
+If you want to use 1core on a server separate from your reverse proxy server, modify line 37 of 1core.go: replace 'localhost' with '0.0.0.0' so that it accepts connections over your VPN from your reverse proxy. +
+
+

Set Administrator Password for the Web Interface

+There is no default web login, you will have to set this manually the first time: +
+Rename the /html/hash folder to something private.
+
+Edit html/private_folder_name/hashmake.php and change 'secretpassword' to your preferred admin password. 
+
+Access /private_folder_name/hashmake.php from your browser and copy down the hash.
+
+After you have copied it down, delete or remove hashmake.php from your web server folder so that the hash cannot be discovered.
+
+Login to MySQL and create the account: +
+use wiby;
+INSERT INTO accounts (name,hash,level) VALUES('your_username','your_password_hash','admin');
+
+You can now access /accounts/ from your browser, login to create and manage all accounts for administrators and guardians of the search engine. +
+
+admin - Can access all web forms for the search engine and use the /accounts/ page to create and delete accounts. +
+
+guardian - The main role of a guardian is to gatekeep the index of the search engine. Can access all forms except for /readf/, and can only use the /accounts/ page to change their own password. +
+
+
+
+

Controlling the Search Engine

+
+There are several forms to control the search engine. There is no central form linking everything together, just a collection of different folders that you can rename if you want. +
+
+

/submit/

This public facing form allows users of the search engine to submit websites for indexing, provided they comply with your submission criteria, which you can modify on /submit/form.html.php. +
+
+

/accounts/

+This is the account management page. Admins have options to create, lock, change account type, delete, and reset passwords. Guardians have the option to change their password. +
+
+

/review/

This is the most important form, intended for you to verify website submissions meet your criteria. Up to 10 pages are assigned to each guardian or admin that accesses the form. The pages will remain assigned to that account for up to 30 minutes. +From here you can control how much, how deep, and how often the web crawler will access each submission. Here is an example of the available options for a website submission: +
+
+url_that_was_submitted +
+[Worksafe] + [Surprise] + [Skip] + [Bury] + [Deny] + [Updatable] +
+ [Crawl: Depth + Pages + Type + Enforce Rules + Repeat] +
+
+Explanation of the above options: +
+
+Worksafe - Indicates if the website is safe for work. Set by the user who submitted the website, however you can change it based on your determination. +
+
+Surprise - Checking this box will put it in the "surprise me" feature, where users get redirected to random websites when they click "surprise me". Note that this feature won't show NSFW websites even if they are set to surprise. +
+
+Skip - Selecting this option will skip indexing the page and it will reappear on the review form after you submit the rest of the pages for crawling. +
+
+Bury - Selecting this will move the page to a graveyard (/grave/), a holding place with the same options as /review/ for websites that might have stopped working but that you suspect may come back online. The crawler will detect this automatically and send the page back into review. When you click on the link and see a 404, you can be assured the crawler sent it back to review after failing two update cycles. This also happens if the title of the page changes. The crawler will only do this for pages directly submitted by people. This courtesy is not given to websites that are automatically crawled but then fail to work later on. For those sites, after two failed update cycles, the page will be removed. +
+
+Deny - Select this to drop the page from being indexed. If the page does not meet your submission criteria, this would be the option to remove it from the queue. +
+
+Updatable - The update cycle for the web crawler to return to the page. This only applies to pages submitted by people, pages found by link crawling always go on a 1 week update cycle. +
+
+------------------- Crawl ------------------- +
+The options listed below control how the crawler indexes hyperlinks on the website. By default, the crawler does not index any hyperlinks, it will only index the page that is submitted. +
+
+Depth - How many layers of links to crawl through. You must set at least a depth of 1 if you want to crawl any hyperlinks. Setting a negative value = no limit. Be careful about that. +
+
+Pages - How many pages to crawl on each link layer (depth). They will be randomly selected. You must set at least 1 if you want to crawl any hyperlinks. Setting a negative value = no limit. Be careful about that. +
+
+Type - Indicates if you want to only crawl links local to the website, or links external to the website, or both. +
+
+Enforce rules - This is a blunt tool that checks if pages have more than two scripts and/or css files. If the limit is exceeded, the page will not be indexed. I don't use it and prefer to manually check based on more forgiving criteria. +
+
+Repeat - While the crawler will always return to update each page in the index, it won't crawl through hyperlinks again unless you tell it to. Even so, it only crawls hyperlinks on the page at a depth of 1 when repeat is selected. +
+
+

/ban/

+Delete or ban a list of URLs from the index with this form. You can't delete an entire domain with it, for that you can build your own query in the MySQL console. +
+
+

/bulksubmit/

+Admins/Guardians can import a list of URLs into the review queue with this form. +
+
+

/feedback/

+Users can submit feedback for you with this form. +
+
+

/readf/

+Where admin accounts can read feedback submitted by users. +
+
+

/grave/

+It has the same features as /review/. Websites that you don't yet want to index but don't want to forget about are stored inside /grave/ by selecting 'bury' from inside /review/. The web crawler will (only for pages submitted directly by people), move 404'd pages or pages where the title has changed back to /review/ after two update cycles +where the page does not return to normal. So after a few weeks you may notice dead pages appearing in /review/, you can decide to drop the page or to bury it where it will be moved to /grave/. The page might go back to normal at some point and you can check /grave/ to see if it resurrects. +
+
+

/insert/

+This was the first form created back in late 2016 to populate the Wiby index and see if the search engine could even work as a proof of concept. It was meant to manually enter pages into the index as no crawler existed yet. +It is still useful if you want to manually index a page that refuses to permit the crawler to access it. In that case, set updatable to 0. +
+
+

/tags/

+If you want to force a website to appear at the top rank for specific single word queries (like "weather"), you can force it by tagging the words to the target url. +
+
+

/json/

+This is the JSON API developers can use to connect their services to the search engine. Instructions are located at that location. +
+
+

Additional Notes

+If you need to stop the web crawler in a situation where it was accidentally queued to index an unlimited number of pages, first stop the crawler program, truncate the indexqueue table 'truncate indexqueue;', then restart the crawler. +
+
+
+
+

Scaling the Search Engine

+
+You can help ensure sub-second search queries as your index grows by building MySQL replica servers on a local network close to each other. Run the core application AND replication tracker (rt) in the same directory on one or more full-replica servers and point your reverse proxy to use it. Edit the servers.csv file for rt to indicate all available replica IPs and available shard tables (ws0 to wsX). Four are already preconfigured. +
+
+If you have a machine with at least four CPU cores, entering multiple duplicate entries to the same server inside servers.csv (e.g. one for each CPU core) works also. By default, four duplicate connections are already set to use your existing machine. +
+
+The core application checks the replication tracker (rt) output to determine if any replicas or duplicate connections are available, it will initiate a connection on those replicas and task each one to search a different shard table, drastically speeding up search speeds. +
+
+The search results per page limit is 12, and should evenly divide 'into' OR 'by' the total number of replicas/shards defined in servers.csv. You don't need to restart the tracker when editing servers.csv. + As an example, if you have three computers with a 4-core CPU on each, you can create up to 12 shard tables, then point the tracker to use 4 shards on each computer for maximum use. Another option would be to keep the default four shard and four duplicate connection configuration, host the core application and rt on each computer, and use nginx to load balance traffic between them. +
+
+The reverse proxy and replica servers can be connected through a VPN such as wireguard or openvpn, however the IPs for servers.csv should be the local IPs for the LAN +the replicas are all connected on. See the instructions to setup a MySQL replica, and here is a longer tutorial on MySQL replicas should you need more info. +
+
+Indicate the number of shards in the 'shards' file that the crawler references (four are already preconfigured). If for some reason you need to rebuild/rebalance the shard tables, see the directions here. To create more shard tables, see this section. If for some reason you only want to host specific shard tables on a replica, you can use replication filtering. +
+
+
+Instructions for Building a MySQL Replica: +
+
+On the primary server add these lines to my.cnf under [mysqld] but only once you have a VPN to reach your replicas. Replace my.vpn.ip with your own, then restart MySQL. +
+#setting up replication below
+bind-address = 127.0.0.1,my.vpn.ip
+server-id = 1
+log_bin = /var/log/mysql/mysql-bin.log
+binlog_do_db = wiby
+binlog_format = mixed
+
+In MySQL on the primary server, create a user for replica access, replacing the IP 10.0.0.% with your own VPN IP and allowed subnet: +
+create user 'slave_user'@'10.0.0.%' identified by 'd0gemuchw0w';
+GRANT REPLICATION SLAVE ON *.* TO 'slave_user'@'%';
+FLUSH PRIVILEGES;
+
+On the replica server, after installing MySQL, ensure the following my.cnf configuration, set the server-id as a unique id for each replica, then restart MySQL: +
+[client]
+default-character-set=utf8mb4
+
+[mysql]
+default-character-set = utf8mb4
+
+[mysqld]
+max_connections = 2000
+ft_min_word_len=1
+sql_mode = "NO_BACKSLASH_ESCAPES"
+#character-set-client-handshake = FALSE
+character-set-server = utf8mb4
+collation-server = utf8mb4_unicode_520_ci
+innodb_ft_enable_stopword=0
+skip-character-set-client-handshake
+default-authentication-plugin=mysql_native_password
+wait_timeout = 800
+
+#memory use settings, you should adjust this based on your hardware
+innodb_buffer_pool_size = 1342177280
+innodb_buffer_pool_instances = 2
+innodb_flush_method = O_DIRECT
+
+#setting up replication below
+bind-address = 0.0.0.0
+server-id = 2
+relay_log_info_repository = TABLE
+relay_log_recovery = ON
+sync_binlog=1
+
+Make sure only VPN and VLAN addresses can reach your replicas. The bind address of 0.0.0.0 can be replaced with '127.0.0.1,replica.vpn.ip' which is safer but also more crash prone if the VPN address is not available on startup. +
+
+To export the database to the replica server, on the primary server, stop the web crawler and hide any web forms that can accept new data, then open MySQL and do the following. +
+USE wiby;
+FLUSH TABLES WITH READ LOCK;
+SHOW MASTER STATUS;
+
++------------------+----------+--------------+------------------+-------------------+
+| File             | Position | Binlog_Do_DB | Binlog_Ignore_DB | Executed_Gtid_Set |
++------------------+----------+--------------+------------------+-------------------+
+| mysql-bin.000055 | 15871269 | wiby         |                  |                   |
++------------------+----------+--------------+------------------+-------------------+
+
+Keep the above session window open (or run it in a screen session). +
+Copy down the information from that table. In a separate session window, export the database: +
+mysqldump -u root -p wiby > wiby.sql
+
+Once you have exported the database and recorded what you need, you can unlock the tables, and resume as normal. On the session window displaying the master status: +
+UNLOCK TABLES;
+
+You can now close that window if you want. +
+
+On the replica server, login to MySQL and create the database: +
+CREATE DATABASE wiby;
+EXIT;
+
+Import the database: +
+mysql -u root -p wiby < wiby.sql
+
+Login to MySQL and type the following but replace the primary_server_ip, MASTER_LOG_FILE, and MASTER_LOG_POS with yours from the table: +
+CHANGE MASTER TO MASTER_HOST='primary_server_ip',MASTER_USER='slave_user', MASTER_PASSWORD='d0gemuchw0w', MASTER_LOG_FILE='mysql-bin.000055', MASTER_LOG_POS=15871269;
+START SLAVE;
+
+To verify that the replica is synchronized, type the following on the replica in MySQL: +
+SHOW SLAVE STATUS\G
+
+Make sure that: +
+Slave_IO_Running: Yes
+Slave_SQL_Running: Yes
+
+In MySQL on the replica, create the accounts required for the replication tracker and core application. +Note that the remote_guest account will allow connections from any outside machine. Make sure your replica is protected behind a firewall.
+
+use wiby;
+create user 'guest'@'localhost' identified by 'qwer';
+grant select on windex to 'guest'@'localhost';
+create user 'remote_guest'@'%' identified by 'd0gemuchw0w';
+grant select on windex to 'remote_guest'@'%';
+grant select on ws0 to 'remote_guest'@'%';
+grant select on ws1 to 'remote_guest'@'%';
+grant select on ws2 to 'remote_guest'@'%';
+grant select on ws3 to 'remote_guest'@'%';
+create user 'crawler'@'localhost' identified by 'seekout';
+FLUSH PRIVILEGES;
+
+To update the host for any account, do the following:
+
+use mysql;
+Select user,host from user;
+RENAME USER 'username'@'oldhost' TO 'username'@'newhost';
+
+
+Creating More Shard Tables +
+There are four shard tables already in the database, but if you need more: +
+
+Stop the crawler and update the number in the 'shards' file, then copy a shard table entry (wsX) from the wiby.db template file, renaming it in the proper number sequence, and paste that into the mysql console on the primary database. +
+
+Make sure to give access to the new shard tables. +
+
+You will need to rebalance the shards: follow the steps below, then restart the crawler. Going forward it will round-robin insert into those shards as new pages are crawled. +
+
+
+Accessing Additional Shards +
+Apply the account access permissions listed here for core app and rt access to each replica and here for crawler access to each new shard table on the primary server or replica hosting the core app. +
+
+
+Balancing Additional Shards +
+For now, you have to manually rebalance shards when adding new ones. The most straightforward way to rebalance them is to: +
+
+Update 'servers.csv' with the additional shard connections being used. +
+
+Stop the crawler and update 'shards' with the new total of shards being used. +
+
+Start up rt, then copy down the id numbers referenced for each connection. +
+
+Truncate all the shard tables on the primary: +
+truncate ws0; truncate ws1; etc.
+
+Repopulate the 1st shard table (and so on), on the primary server: +
+"UPDATE windex SET shard = 0 WHERE id BETWEEN 0 AND 5819;" replacing those id numbers with those indicated by rt. 
+"INSERT INTO ws0 SELECT * FROM windex WHERE id BETWEEN 0 AND 5819;" replacing those id numbers with those indicated by rt. 
+
+Repeat those steps for each shard table.
+
+These changes will propagate down to the replicas, and the core application will be able to use them as long as permissions to those tables were added. +
+
+
+

Load Balancing

+You should run the core application on one or more of your replicas and have nginx send traffic to it; this way you can reduce the burden on your VPS. The replication tracker (rt) must run on the same server and directory that the core application is running on (not required for 1core). +
+
+Add the replica server's VPN address/port to upstream remote_core {} from the default config for nginx (see the provided example template). You can use the VPS as a backup instead by adding 'backup' to its address (e.g. server 127.0.0.1:8080 backup;). +
+
+

Additional Notes

+The crawler stores a maximum of 80KB worth of text from the body of each webpage. To change this limit, edit the "body_len" definition from inside htmlparse.h and recompile the crawler. +This will affect the total size of the index and overall search speeds. +

+
+ + diff --git a/html/about/index.html b/html/about/index.html new file mode 100755 index 0000000..13ba936 --- /dev/null +++ b/html/about/index.html @@ -0,0 +1,31 @@ + + + +About + + + + +
+

Why OoHay?

+

+We are a search engine for the amazing network known as Compu-Global-Hyper-Mega-Net, which you are using to view this page right now! OoHay connects people to websites, by allowing them to search for the site they're looking for. +
+
+OoHay is a customized instance of Wiby. +
+
+ +
+
+Send Feedback

+

+

Additional Features: +

Developers can connect their applications using the JSON output available at /json. +
+
+Privacy Policy +

+
+ + diff --git a/html/about/oohay.gif b/html/about/oohay.gif new file mode 100644 index 0000000..5c4ecb0 Binary files /dev/null and b/html/about/oohay.gif differ diff --git a/html/about/pp.html b/html/about/pp.html new file mode 100755 index 0000000..b2e4c35 --- /dev/null +++ b/html/about/pp.html @@ -0,0 +1,13 @@ + + + +Privacy Policy + + + + +

Privacy Policy

+ Raw access logs are kept for up to 48 hours and automatically deleted beyond such time.
Raw access logs existing within the 48 hour period are private; they are not shared with third parties. +

+ + diff --git a/html/about/wiby.gif b/html/about/wiby.gif new file mode 100755 index 0000000..87a6a75 Binary files /dev/null and b/html/about/wiby.gif differ diff --git a/html/about/wiby.org.gif b/html/about/wiby.org.gif new file mode 100755 index 0000000..69fdcf6 Binary files /dev/null and b/html/about/wiby.org.gif differ diff --git a/html/about/wibyplex.gif b/html/about/wibyplex.gif new file mode 100755 index 0000000..cd5a5d1 Binary files /dev/null and b/html/about/wibyplex.gif differ diff --git a/html/accounts/accounts.html.php b/html/accounts/accounts.html.php new file mode 100755 index 0000000..08e5852 --- /dev/null +++ b/html/accounts/accounts.html.php @@ -0,0 +1,25 @@ + + + + + + + Form Example + + + + + + + +

+ + + +

+ + + + diff --git a/html/accounts/accounts.php b/html/accounts/accounts.php new file mode 100755 index 0000000..3a9496d --- /dev/null +++ b/html/accounts/accounts.php @@ -0,0 +1,249 @@ + + + diff --git a/html/accounts/admin.html.php b/html/accounts/admin.html.php new file mode 100755 index 0000000..10f3e2b --- /dev/null +++ b/html/accounts/admin.html.php @@ -0,0 +1,42 @@ + + + + + Account Management + + + + + +
+
+
+ Username
+ Password 
+ Level        
+ Action      

+
+
+
Accounts:
+ + +

+ + + +
+ + diff --git a/html/accounts/error.html.php b/html/accounts/error.html.php new file mode 100755 index 0000000..ca173d0 --- /dev/null +++ b/html/accounts/error.html.php @@ -0,0 +1,25 @@ + + + + + + + PHP Error Output + + + + + + + +

+ + + +

+ + + + diff --git a/html/accounts/guardian.html.php b/html/accounts/guardian.html.php new file mode 100755 index 0000000..9cafb93 --- /dev/null +++ b/html/accounts/guardian.html.php @@ -0,0 +1,25 @@ + + + + + Account Management + + + + + +
+
+ Update Password 
+
+
+
+ + + diff --git a/html/accounts/index.php b/html/accounts/index.php new file mode 100755 index 0000000..e74c5d2 --- /dev/null +++ b/html/accounts/index.php @@ -0,0 +1,98 @@ + 'Strict']); + session_start(); + + if ( !isset($_POST['pass']) || !isset($_POST['user'])) + { + include 'login.html.php'; + } + else if( $_POST['user'] == '' || $_POST['pass'] == '') + { + echo "It doesn't look like you submitted a valid username or password."; + include 'login.html.php'; + } + else + { + if(!isset($_SESSION["authenticated"])) + { + include_once $_SERVER['DOCUMENT_ROOT'] . '/securimage/securimage.php'; + $securimage = new Securimage(); + if ($securimage->check($_POST['captcha_code']) == false) + { + echo "The security code entered was incorrect."; + include 'login.html.php'; + exit(); + + } + } + + $link = mysqli_connect('localhost', 'approver', 'foobar'); + $user = mysqli_real_escape_string($link, $_POST['user']); + $pass = mysqli_real_escape_string($link, $_POST['pass']); + + if (!$link) + { + $error = 'Cant connect to database.'; + include 'error.html.php'; + exit(); + } + if (!mysqli_set_charset($link, 'utf8')) + { + $error = 'Unable to set database connection encoding.'; + include 'error.html.php'; + exit(); + } + if(!mysqli_select_db($link, 'wiby')) + { + $error = 'Unable to locate the database.'; + include 'error.html.php'; + exit(); + } + $loginresult = mysqli_query($link,"SELECT hash, attempts, level FROM accounts WHERE name = '$user';"); + if(!$loginresult) + { + $error = 'Error fetching index: ' . 
mysqli_error($link); + include 'error.html.php'; + exit(); + } + + //lets put contents of accounts into an array + while($rowaccounts = mysqli_fetch_array($loginresult)) + { + $hash[] = $rowaccounts['hash']; + $attempts[] = $rowaccounts['attempts']; + $level[] = $rowaccounts['level']; + } + if(password_verify($pass,$hash[0]) && $attempts[0] < 5) + { + if($attempts[0]>0) + { + if (!mysqli_query($link, "UPDATE accounts SET attempts = '0' WHERE name = '$user';")) + { + $error = 'Error fetching index: ' . mysqli_error($link); + include 'error.html.php'; + exit(); + } + } + + $_SESSION["authenticated"] = true; + $_SESSION["user"] = $user; + $_SESSION["level"] = $level[0]; + include 'accounts.php'; + exit(); + } + else{ + $attempt = $attempts[0] + 1; + if (!mysqli_query($link, "UPDATE accounts SET attempts = '$attempt' WHERE name = '$user';")) + { + $error = 'Error fetching index: ' . mysqli_error($link); + include 'error.html.php'; + exit(); + } + echo "It doesn't look like you submitted a valid username or password."; + include 'login.html.php'; + } + } +?> + diff --git a/html/accounts/login.html.php b/html/accounts/login.html.php new file mode 100755 index 0000000..18aa986 --- /dev/null +++ b/html/accounts/login.html.php @@ -0,0 +1,31 @@ + + + + + + + wiby.me + + + + + + + +
+ Username
+ Password

+ +
+ CAPTCHA Image +
+
+ + Reload Image +
+ +
+
+ + + diff --git a/html/ban/ban.html.php b/html/ban/ban.html.php new file mode 100755 index 0000000..aea6652 --- /dev/null +++ b/html/ban/ban.html.php @@ -0,0 +1,25 @@ + + + + + + + Result + + + + + + + +

+ Completed.

+ + +

+ Return + + + diff --git a/html/ban/ban.php b/html/ban/ban.php new file mode 100755 index 0000000..8f92048 --- /dev/null +++ b/html/ban/ban.php @@ -0,0 +1,150 @@ + diff --git a/html/ban/error.html.php b/html/ban/error.html.php new file mode 100755 index 0000000..311ea77 --- /dev/null +++ b/html/ban/error.html.php @@ -0,0 +1,30 @@ + + + + + + + PHP Error Output + + + + + + + +

+ + +
+
+ Note: An SQL error will occur if the page is not typed exactly as seen in the search results. +
+
+ Return +

+ + + + diff --git a/html/ban/form.html.php b/html/ban/form.html.php new file mode 100755 index 0000000..4a54950 --- /dev/null +++ b/html/ban/form.html.php @@ -0,0 +1,33 @@ + + + + + Ban a page + + + + + +
+
+
+ + + + + +
+
+ + +

*Unchecking this will ban them from the index instead.


+
+
+
+ + diff --git a/html/ban/index.php b/html/ban/index.php new file mode 100755 index 0000000..75bd432 --- /dev/null +++ b/html/ban/index.php @@ -0,0 +1,96 @@ + 'Strict']); + session_start(); + + if ( !isset($_POST['pass']) || !isset($_POST['user'])) + { + include 'login.html.php'; + } + else if( $_POST['user'] == '' || $_POST['pass'] == '') + { + echo "It doesn't look like you submitted a valid username or password."; + include 'login.html.php'; + } + else + { + if(!isset($_SESSION["authenticated"])) + { + include_once $_SERVER['DOCUMENT_ROOT'] . '/securimage/securimage.php'; + $securimage = new Securimage(); + if ($securimage->check($_POST['captcha_code']) == false) + { + echo "The security code entered was incorrect."; + include 'login.html.php'; + exit(); + + } + } + + $link = mysqli_connect('localhost', 'approver', 'foobar'); + $user = mysqli_real_escape_string($link, $_POST['user']); + $pass = mysqli_real_escape_string($link, $_POST['pass']); + + if (!$link) + { + $error = 'Cant connect to database.'; + include 'error.html.php'; + exit(); + } + if (!mysqli_set_charset($link, 'utf8')) + { + $error = 'Unable to set database connection encoding.'; + include 'error.html.php'; + exit(); + } + if(!mysqli_select_db($link, 'wiby')) + { + $error = 'Unable to locate the database.'; + include 'error.html.php'; + exit(); + } + $loginresult = mysqli_query($link,"SELECT hash, attempts FROM accounts WHERE name = '$user';"); + if(!$loginresult) + { + $error = 'Error fetching index: ' . mysqli_error($link); + include 'error.html.php'; + exit(); + } + + //lets put contents of accounts into an array + while($rowaccounts = mysqli_fetch_array($loginresult)) + { + $hash[] = $rowaccounts['hash']; + $attempts[] = $rowaccounts['attempts']; + } + if(password_verify($pass,$hash[0]) && $attempts[0] < 5) + { + if($attempts[0]>0) + { + if (!mysqli_query($link, "UPDATE accounts SET attempts = '0' WHERE name = '$user';")) + { + $error = 'Error fetching index: ' . 
mysqli_error($link); + include 'error.html.php'; + exit(); + } + } + + $_SESSION["authenticated"] = true; + $_SESSION["user"] = $user; + include 'ban.php'; + exit(); + } + else{ + $attempt = $attempts[0] + 1; + if (!mysqli_query($link, "UPDATE accounts SET attempts = '$attempt' WHERE name = '$user';")) + { + $error = 'Error fetching index: ' . mysqli_error($link); + include 'error.html.php'; + exit(); + } + echo "It doesn't look like you submitted a valid username or password."; + include 'login.html.php'; + } + } +?> + diff --git a/html/ban/login.html.php b/html/ban/login.html.php new file mode 100755 index 0000000..18aa986 --- /dev/null +++ b/html/ban/login.html.php @@ -0,0 +1,31 @@ + + + + + + + wiby.me + + + + + + + +
+ Username
+ Password

+ +
+ CAPTCHA Image +
+
+ + Reload Image +
+ +
+
+ + + diff --git a/html/bulksubmit/bulksubmit.php b/html/bulksubmit/bulksubmit.php new file mode 100755 index 0000000..31668cc --- /dev/null +++ b/html/bulksubmit/bulksubmit.php @@ -0,0 +1,102 @@ + diff --git a/html/bulksubmit/error.html.php b/html/bulksubmit/error.html.php new file mode 100755 index 0000000..ca173d0 --- /dev/null +++ b/html/bulksubmit/error.html.php @@ -0,0 +1,25 @@ + + + + + + + PHP Error Output + + + + + + + +

+ + + +

+ + + + diff --git a/html/bulksubmit/form.html.php b/html/bulksubmit/form.html.php new file mode 100755 index 0000000..2a917da --- /dev/null +++ b/html/bulksubmit/form.html.php @@ -0,0 +1,42 @@ + + + + + + Bulk submit to the Wiby Web + + + + + + +
+ +
+ +
+ + + + + + + +
+ +
+ + +

+
+
+
+ + + + diff --git a/html/bulksubmit/index.php b/html/bulksubmit/index.php new file mode 100755 index 0000000..cac9de6 --- /dev/null +++ b/html/bulksubmit/index.php @@ -0,0 +1,97 @@ + 'Strict']); + session_start(); + + if ( !isset($_POST['pass']) || !isset($_POST['user'])) + { + include 'login.html.php'; + } + else if( $_POST['user'] == '' || $_POST['pass'] == '') + { + echo "It doesn't look like you submitted a valid username or password."; + include 'login.html.php'; + } + else + { + if(!isset($_SESSION["authenticated"])) + { + include_once $_SERVER['DOCUMENT_ROOT'] . '/securimage/securimage.php'; + $securimage = new Securimage(); + if ($securimage->check($_POST['captcha_code']) == false) + { + echo "The security code entered was incorrect."; + include 'login.html.php'; + exit(); + + } + } + + $link = mysqli_connect('localhost', 'approver', 'foobar'); + $user = mysqli_real_escape_string($link, $_POST['user']); + $pass = mysqli_real_escape_string($link, $_POST['pass']); + + if (!$link) + { + $error = 'Cant connect to database.'; + include 'error.html.php'; + exit(); + } + if (!mysqli_set_charset($link, 'utf8')) + { + $error = 'Unable to set database connection encoding.'; + include 'error.html.php'; + exit(); + } + if(!mysqli_select_db($link, 'wiby')) + { + $error = 'Unable to locate the database.'; + include 'error.html.php'; + exit(); + } + $loginresult = mysqli_query($link,"SELECT hash, attempts FROM accounts WHERE name = '$user';"); + if(!$loginresult) + { + $error = 'Error fetching index: ' . mysqli_error($link); + include 'error.html.php'; + exit(); + } + + //lets put contents of accounts into an array + while($rowaccounts = mysqli_fetch_array($loginresult)) + { + $hash[] = $rowaccounts['hash']; + $attempts[] = $rowaccounts['attempts']; + } + if(password_verify($pass,$hash[0]) && $attempts[0] < 5) + { + if($attempts[0]>0) + { + if (!mysqli_query($link, "UPDATE accounts SET attempts = '0' WHERE name = '$user';")) + { + $error = 'Error fetching index: ' . 
mysqli_error($link); + include 'error.html.php'; + exit(); + } + } + + $_SESSION["authenticated"] = true; + $_SESSION["user"] = $user; + $_SESSION["loadreview"]=true; + include 'bulksubmit.php'; + exit(); + } + else{ + $attempt = $attempts[0] + 1; + if (!mysqli_query($link, "UPDATE accounts SET attempts = '$attempt' WHERE name = '$user';")) + { + $error = 'Error fetching index: ' . mysqli_error($link); + include 'error.html.php'; + exit(); + } + echo "It doesn't look like you submitted a valid username or password."; + include 'login.html.php'; + } + } +?> + diff --git a/html/bulksubmit/login.html.php b/html/bulksubmit/login.html.php new file mode 100755 index 0000000..18aa986 --- /dev/null +++ b/html/bulksubmit/login.html.php @@ -0,0 +1,31 @@ + + + + + + + wiby.me + + + + + + + +
+ Username
+ Password

+ +
+ CAPTCHA Image +
+
+ + Reload Image +
+ +
+
+ + + diff --git a/html/bulksubmit/submit.html.php b/html/bulksubmit/submit.html.php new file mode 100755 index 0000000..8d4fb91 --- /dev/null +++ b/html/bulksubmit/submit.html.php @@ -0,0 +1,28 @@ + + + + + + + Completed Submission + + + + + + + + +

+ + +

+ Return to bulk submission page +

+ Return to wiby + +

+ + + + diff --git a/html/error.html.php b/html/error.html.php new file mode 100755 index 0000000..ca173d0 --- /dev/null +++ b/html/error.html.php @@ -0,0 +1,25 @@ + + + + + + + PHP Error Output + + + + + + + +

+ + + +

+ + + + diff --git a/html/favicon.ico b/html/favicon.ico new file mode 100755 index 0000000..51386de Binary files /dev/null and b/html/favicon.ico differ diff --git a/html/feedback/error.html.php b/html/feedback/error.html.php new file mode 100755 index 0000000..ca173d0 --- /dev/null +++ b/html/feedback/error.html.php @@ -0,0 +1,25 @@ + + + + + + + PHP Error Output + + + + + + + +

+ + + +

+ + + + diff --git a/html/feedback/form.html.php b/html/feedback/form.html.php new file mode 100755 index 0000000..d9a7e70 --- /dev/null +++ b/html/feedback/form.html.php @@ -0,0 +1,41 @@ + + + + + + Wiby Feedback Form + + + + + + +
+ +
+ +
+ + + + + + + +
+
+
+ CAPTCHA Image +
+
+ + Reload Image +

* Cookies must be enabled for the captcha.

+

+
+ +
+ + + + diff --git a/html/feedback/index.php b/html/feedback/index.php new file mode 100755 index 0000000..a4d717f --- /dev/null +++ b/html/feedback/index.php @@ -0,0 +1,73 @@ +check($_POST['captcha_code']) == false) + { + echo "The security code entered was incorrect."; + include 'form.html.php'; + exit(); + + } + + + $link = mysqli_connect('localhost', 'guest', 'qwer'); + + if (!$link) + { + $error = 'Cant connect to database.'; + include 'error.html.php'; + exit(); + } + + if (!mysqli_set_charset($link, 'utf8')) + { + $error = 'Unable to set database connection encoding.'; + include 'error.html.php'; + exit(); + } + + if(!mysqli_select_db($link, 'wiby')) + { + $error = 'Unable to locate the database.'; + include 'error.html.php'; + exit(); + } + + //$feedback = str_replace("\'", "\'\'", $_POST['feedback']); //single quotes must be handled correctly + //$feedback = str_replace("\"", "\"\"", $feedback);//double quotes must be handled correctly + $feedback = mysqli_real_escape_string($link, $_POST['feedback']); + + $feedback = substr($feedback,0,8000); //don't allow user to post a longer string than 8k (also limited in form) + + + $sql = "INSERT INTO feedback (message) VALUES ('".$feedback."')"; + + + if (!mysqli_query($link, $sql)) + { + $error = 'Error fetching index: ' . mysqli_error($link); + include 'error.html.php'; + exit(); + } + //Send thank you message which includes feedback + $output = htmlspecialchars($_POST['feedback'], ENT_QUOTES, 'UTF-8'); + + include 'submit.html.php'; + } +?> + + diff --git a/html/feedback/submit.html.php b/html/feedback/submit.html.php new file mode 100755 index 0000000..2165415 --- /dev/null +++ b/html/feedback/submit.html.php @@ -0,0 +1,36 @@ + + + + + + + Completed Feedback Submission + + + + + + + + +

+ Thank you for submitting feedback:


+
+


+ Return to About page +

+ Return to Wiby + +

+ + + + diff --git a/html/form.html.php b/html/form.html.php new file mode 100755 index 0000000..38035b9 --- /dev/null +++ b/html/form.html.php @@ -0,0 +1,46 @@ + + + + Title + + + + + + + + + +

name


+
+
+ + +

+ +
+
+
+
+
+
+
+               .n.                     |
+              /___\          _.---.  \ _ /
+              [|||]         (_._ ) )--;_) =-
+              [___]          '---'.__,' \
+              }-=-{                    |
+              |-" |
+              |.-"|                p
+       ~^=~^~-|_.-|~^-~^~ ~^~ -^~^~|\ ~^-~^~-
+       ^   .=.| _.|__  ^       ~  /| \
+        ~ /:. \" _|_/\    ~      /_|__\  ^
+       .-/::.  |   |""|-._    ^   ~~~~
+         `===-'-----'""`  '-.              ~
+      jgs               __.-'      ^
+        
+
+
Privacy | About +
+ + diff --git a/html/grave/error.html.php b/html/grave/error.html.php new file mode 100755 index 0000000..ca173d0 --- /dev/null +++ b/html/grave/error.html.php @@ -0,0 +1,25 @@ + + + + + + + PHP Error Output + + + + + + + +

+ + + +

+ + + + diff --git a/html/grave/graveyard.php b/html/grave/graveyard.php new file mode 100755 index 0000000..ed92a80 --- /dev/null +++ b/html/grave/graveyard.php @@ -0,0 +1,193 @@ += '".$startID."' AND id <= '".$endID."'"); + if(!$result) + { + $error = 'Error fetching index: ' . mysqli_error($link); + include 'error.html.php'; + exit(); + } + } + else + { + //check graveyard for rows that are reserverd within reservetime. Do not select reserved rows. If reserved rows exceed 30mins, they can be reserved by different approver. + $result = mysqli_query($link,"SELECT * FROM graveyard WHERE reserved IS NULL OR reserved = '".$_SESSION["user"]."' OR reservetime < NOW() - INTERVAL 30 MINUTE LIMIT $lim"); + if(!$result) + { + $error = 'Error fetching index: ' . mysqli_error($link); + include 'error.html.php'; + exit(); + } + } + + //lets put contents of index into an array + while($row = mysqli_fetch_array($result)) + { + $id[] = $row['id']; + $url[] = str_replace("'", "%27", $row['url']); + $worksafe[] = $row['worksafe']; + } + + if ($_SERVER['REQUEST_METHOD'] === 'POST' && isset($_POST['startid']) && $_SESSION["loadgraveyard"]==false) + { //store approved url list into indexqueue + $i=0; + $num_crawlers=1;//modify this variable to the number of crawlers you are using in parallel. 
+ $crawler_id=1; + foreach($id as $pageid) + { + if($_POST["deny$pageid"] != 'on' && $_POST["skip$pageid"] != 'on') + { + + $worksafe = mysqli_real_escape_string($link, $_POST["worksafe$pageid"]); + if($worksafe == 'on') + { + $worksafe = 1; + } + else + { + $worksafe = 0; + } + + + if($_POST["surprise$pageid"] == 'on') + { + $surprise = 1; + } + else + { + $surprise = 0; + } + + if($_POST["forcerules$pageid"] == 'on') + { + $forcerules = 1; + } + else + { + $forcerules = 0; + } + + if($_POST["crawlrepeat$pageid"] == 'on') + { + $crawlrepeat = 1; + } + else + { + $crawlrepeat = 0; + } + + $updatable = $_POST["updatable$pageid"]; + $crawldepth = $_POST["crawldepth$pageid"]; + $crawlpages = $_POST["crawlpages$pageid"]; + $crawltype = $_POST["crawltype$pageid"]; + + $sql = "INSERT INTO indexqueue (url,worksafe,approver,surprise,updatable,crawl_depth,crawl_pages,crawl_type,force_rules,crawl_repeat,crawler_id) VALUES ('".$url[$i]."','".$worksafe."','".$_SESSION["user"]."','".$surprise."','".$updatable."','".$crawldepth."','".$crawlpages."','".$crawltype."','".$forcerules."','".$crawlrepeat."','".$crawler_id."')"; + if (!mysqli_query($link, $sql)) + { + $error = 'Error inserting into indexqueue: ' . mysqli_error($link); + include 'error.html.php'; + exit(); + } + } + if($_POST["skip$pageid"] != 'on' || ($_POST["skip$pageid"] == 'on' && $_POST["deny$pageid"] == 'on')) + { + $result2 = mysqli_query($link,"DELETE FROM graveyard WHERE id = $pageid"); + if(!$result2) + { + $error = 'Error deleting from graveyard: ' . 
mysqli_error($link); + include 'error.html.php'; + exit(); + } + } + $i++; + if($crawler_id == $num_crawlers){ + $crawler_id=1; + }else{ + $crawler_id++; + } + } + + $_SESSION["loadgraveyard"]=true; + unset($id); + unset($url); + unset($worksafe); + unset($startID); + unset($endID); + unset($result); + $link -> close(); + include 'graveyard.php'; + //include 'refresh.html'; + exit(); + } + else + { + $_SESSION["loadgraveyard"]=false; + //insert approver into reserved, reservetime will autoupdate, so that they cannot be taken by a different approver for 30 mins. + foreach($id as $pageid) + { + $result = mysqli_query($link,"UPDATE graveyard SET reserved = '".$_SESSION["user"]."' WHERE id = $pageid"); + if(!$result) + { + $error = 'Error fetching index: ' . mysqli_error($link); + include 'error.html.php'; + exit(); + } + } + + //get total number of rows remaining in queue + $totalrows = mysqli_query($link,"select count(id) from graveyard"); + if(!$totalrows) + { + $error = 'Error fetching index: ' . mysqli_error($link); + include 'error.html.php'; + exit(); + } + //get result of total rows remaining in queue + while($row = mysqli_fetch_array($totalrows)) + { + $queuesize = $row['count(id)']; + echo $queuesize . " pages queued in total."; + } + + include 'graveyardqueue.html.php'; + } +?> diff --git a/html/grave/graveyardqueue.html.php b/html/grave/graveyardqueue.html.php new file mode 100755 index 0000000..514a0b4 --- /dev/null +++ b/html/grave/graveyardqueue.html.php @@ -0,0 +1,64 @@ + + + + + + + Graveyard + + + + + + + + +
+

Some pages awaiting review:

+ + + + +

+ +
+ + [Worksafe] + + [Worksafe] + + [Surprise] + [Skip] + [Deny] + [Updatable] + + [Crawl: Depth + Pages + Type + Enforce Rules + Repeat] +

+ + +
+ +
+ + +
+ + + diff --git a/html/grave/index.php b/html/grave/index.php new file mode 100755 index 0000000..546d9a7 --- /dev/null +++ b/html/grave/index.php @@ -0,0 +1,97 @@ + 'Strict']); + session_start(); + + if ( !isset($_POST['pass']) || !isset($_POST['user'])) + { + include 'login.html.php'; + } + else if( $_POST['user'] == '' || $_POST['pass'] == '') + { + echo "It doesn't look like you submitted a valid username or password."; + include 'login.html.php'; + } + else + { + if(!isset($_SESSION["authenticated"])) + { + include_once $_SERVER['DOCUMENT_ROOT'] . '/securimage/securimage.php'; + $securimage = new Securimage(); + if ($securimage->check($_POST['captcha_code']) == false) + { + echo "The security code entered was incorrect."; + include 'login.html.php'; + exit(); + + } + } + + $link = mysqli_connect('localhost', 'approver', 'foobar'); + $user = mysqli_real_escape_string($link, $_POST['user']); + $pass = mysqli_real_escape_string($link, $_POST['pass']); + + if (!$link) + { + $error = 'Cant connect to database.'; + include 'error.html.php'; + exit(); + } + if (!mysqli_set_charset($link, 'utf8')) + { + $error = 'Unable to set database connection encoding.'; + include 'error.html.php'; + exit(); + } + if(!mysqli_select_db($link, 'wiby')) + { + $error = 'Unable to locate the database.'; + include 'error.html.php'; + exit(); + } + $loginresult = mysqli_query($link,"SELECT hash, attempts FROM accounts WHERE name = '$user';"); + if(!$loginresult) + { + $error = 'Error fetching index: ' . mysqli_error($link); + include 'error.html.php'; + exit(); + } + + //lets put contents of accounts into an array + while($rowaccounts = mysqli_fetch_array($loginresult)) + { + $hash[] = $rowaccounts['hash']; + $attempts[] = $rowaccounts['attempts']; + } + if(password_verify($pass,$hash[0]) && $attempts[0] < 5) + { + if($attempts[0]>0) + { + if (!mysqli_query($link, "UPDATE accounts SET attempts = '0' WHERE name = '$user';")) + { + $error = 'Error fetching index: ' . 
mysqli_error($link); + include 'error.html.php'; + exit(); + } + } + + $_SESSION["authenticated"] = true; + $_SESSION["user"] = $user; + $_SESSION["loadgraveyard"]=true; + include 'graveyard.php'; + exit(); + } + else{ + $attempt = $attempts[0] + 1; + if (!mysqli_query($link, "UPDATE accounts SET attempts = '$attempt' WHERE name = '$user';")) + { + $error = 'Error fetching index: ' . mysqli_error($link); + include 'error.html.php'; + exit(); + } + echo "It doesn't look like you submitted a valid username or password."; + include 'login.html.php'; + } + } +?> + diff --git a/html/grave/login.html.php b/html/grave/login.html.php new file mode 100755 index 0000000..18aa986 --- /dev/null +++ b/html/grave/login.html.php @@ -0,0 +1,31 @@ + + + + + + + wiby.me + + + + + + + +
+ Username
+ Password

+ +
+ CAPTCHA Image +
+
+ + Reload Image +
+ +
+
+ + + diff --git a/html/index.php b/html/index.php new file mode 100755 index 0000000..87d8e98 --- /dev/null +++ b/html/index.php @@ -0,0 +1,520 @@ + 1 && $query[strlen($query)-1]==" "){ + $query = substr($query,0,strlen($query)-1); + } + if(strlen($query) > 1 && $query[0]==" "){ + $query = substr($query,1,strlen($query)); + } + + //check if user wants to search a different search engine (!) or time window + if(($query[0] == "!" || $query[0] == "&") && strlen($query) > 3) + { + //separate actual query from search redirect + $actualquery = ""; + $redirect = ""; + if($query[2] == " "){ + $redirect = substr($query, 1, 1); + for($i=3; $i NOW() - INTERVAL 1 DAY "; + $query = $actualquery; + }else if ($redirect == "tw"){ + $additions = $additions."AND date > NOW() - INTERVAL 7 DAY "; + $query = $actualquery; + }else if ($redirect == "tm"){ + $additions = $additions."AND date > NOW() - INTERVAL 30 DAY "; + $query = $actualquery; + }else if ($redirect == "ty"){ + $additions = $additions."AND date > NOW() - INTERVAL 365 DAY "; + $query = $actualquery; + }else{ + header('Location: '."/?q=$actualquery"); + exit(); + } + } + + //check if user wants to limit search to a specific website + if(strlen($query) > 5 && strcasecmp(substr($query, 0, 5),"site:")==0){ + //remove 'site:' + $query = substr($query, 5, strlen($query)-5); + //get site: + $site = strstr($query, ' ', true); + //now just get the search query + $query = strstr($query, ' ', false); + $query = substr($query, 1, strlen($query)-1); + //add to additions + $additions = $additions."AND url LIKE '%".$site."%' "; + } + + $page=0; + if (!isset($_REQUEST['p'])) + { + $offset=0; + } + else + { + $page = mysqli_real_escape_string($link, $_GET['p']); + $offset = $page; + if($offset > 0) + { + $offset--; + } + $offset = $offset * $lim; + } + + if (!$link) + { + $error = 'Cant connect to database.'; + include 'error.html.php'; + exit(); + } + + if (!mysqli_set_charset($link, 'utf8mb4')) + { + $error = 'Unable to set database 
connection encoding.'; + include 'error.html.php'; + exit(); + } + + if(!mysqli_select_db($link, 'wiby')) + { + $error = 'Unable to locate the database.'; + include 'error.html.php'; + exit(); + } + + $queryOriginal = $query; + + //Check if query is a url (contains http:// or https:// and no spaces). If so, put quotations around to to get an exact match + $urlDetected = 0; + //if(strpos($query, ' ') == false && strpos($query,'.') == true && strpos($query,'"') == false && preg_match('/http/',$query) == true) + if(strpos($query, ' ') == false && strpos($query,'.') == true && strpos($query,'"') == false)//note this will flag on file extensions also + { + $query = '"' . $query . '"'; + $urlDetected = 1; + } + + //did user manually set -https instead of settings cookie? + if(substr($query,-7) == " -https"){ + $filterHTTPS = true; + $query = substr($query, 0,-7); + } + + $queryNoQuotes = $query; + + //Are there quotes in the query? + $exactMatch = false; + if(strpos($queryNoQuotes,'"') !== false) + { + $exactMatch = true; + } + + //alright then lets remove the quotes + if($exactMatch == true) + { + while(strpos($queryNoQuotes,'"') !== false) + { + $queryNoQuotes = str_replace('"', "",$queryNoQuotes); + } + } + + //remove the '*' if contained anywhere in queryNoQuotes + if(strpos($queryNoQuotes,'*') !== false && $exactMatch == false){ + $queryNoQuotes = str_replace('*', "",$queryNoQuotes); + } + + //remove any flags inside queryNoQuotes, also grab any required words (+ prefix) + $queryNoQuotesOrFlags = $queryNoQuotes; + $requiredword = ''; + $flags = ''; + $wordlen = 0; + $flagssetbyuser = 0; + $numRequiredWords = 0; + if(strpos($queryNoQuotes,'+') !== false || strpos($queryNoQuotes,'-') !== false){ + $words = explode(' ', $queryNoQuotes); + $i = 0; + $queryNoQuotesOrFlags = ''; + foreach ($words as $word) { + $wordlen = strlen($word); + if($word != '' && $i != 0 && $word[0] != '-' && $word[0] != '+'){ + $queryNoQuotesOrFlags .= ' '; + } + if ($word != '' && $word[0] != 
'-' && $word[0] != '+'){ + $queryNoQuotesOrFlags .= $word; + } + if ($word != '' && $word[0] == '+' && strlen($word) > 1 && $requiredword == '' && strpos($queryNoQuotes,'-') !== false){ + $requiredword = substr($word,1); + } + if ($word != '' && ($word[0] == '-' || $word[0] == '+')){ + $flags .= " $word"; + $flagssetbyuser++; + if($word[0] == '+'){ + $numRequiredWords++; + } + } + $i++; + } + $flags = checkformat($flags); + } + + //$queryNoQuotes_SQLsafe = mysqli_real_escape_string($link, $queryNoQuotes); + //$flags = mysqli_real_escape_string($link, $flags); + + $words = explode(' ', $queryNoQuotesOrFlags); + $wordcount = 0; + $longestWord = ''; + //find longest word in query + $longestWordLength = 0; + $longestwordelementnum = 0; + foreach ($words as $word) { + if (strlen($word) > $longestWordLength) { + $longestWordLength = strlen($word); + $longestWord = $word; + $longestwordelementnum = $wordcount; + } + if($word != ''){ + $wordcount++; + } + } + + //create another query where all compatible words from queryNoQuotesOrFlags are marked as keywords + $reqwordQuery = ''; + $i=0; + $wordlen=0; + foreach ($words as $word) { + $wordlen = strlen($word); + if($i==0 && $wordlen > 3 && ($word[0] == '+' || $word[0] == '-')){ + $reqwordQuery .= "$word"; + }else if($i==0 && $wordlen > 1 && $word[0] != '+' && $word[0] != '-'){ + if($wordlen > 2){ + $reqwordQuery .= "+$word"; + }else{ + $reqwordQuery .= "$word"; + } + }else if($i==0){ + $reqwordQuery .= "$word"; + } + if($i!=0 && $wordlen > 3 && ($word[0] == '+' || $word[0] == '-')){ + $reqwordQuery .= " $word"; + }else if($i!=0 && $wordlen > 1 && $word[0] != '+' && $word[0] != '-' ){ + if($wordlen > 2){ + $reqwordQuery .= " +$word"; + }else{ + $reqwordQuery .= " $word"; + } + }else if($i!=0){ + $reqwordQuery .= " $word"; + } + $i++; + } + $reqwordQuery = checkformat($reqwordQuery); + $reqwordQuery .= " $flags"; + + //if no required words set, make the longest word in the query required. 
+ $querywithrequiredword = ""; + if($numRequiredWords == 0 && $wordcount > 1 && $longestWordLength > 2){ + $querywithrequiredword = $query .= " +$longestWord"; + } + + if($filterHTTPS == true){ + $additions = $additions."AND http = '1' "; + } + if($worksafe == true){ + $additions = $additions."AND worksafe = '1' "; + } + + $count = 0; + + $queryWithQuotesAndFlags = '"'. $queryNoQuotesOrFlags.'"'.$flags.''; + $queryWithQuotes = '"'. $queryNoQuotesOrFlags.'"'; + + //if query is just 1 or 2 letters, help make it work. + if(iconv_strlen($queryOriginal) < 3){ + $query = "".$query."*"; + $queryWithQuotesAndFlags = $query; + $reqwordQuery = $query; + } + if(stripos($queryOriginal,"c++")!==false){// :) :( :) :( + $exactMatch=true; + $queryWithQuotesAndFlags .= " +programming"; + if(strpos($queryOriginal," ")!==false && $longestWordLength>3){ + $queryWithQuotesAndFlags .= " +$longestWord"; + } + } + + if($querywithrequiredword != ""){ + $querytouse = $querywithrequiredword; + }else if($numRequiredWords > 0){ + $querytouse = $reqwordQuery; + }else{ + $querytouse = $query; + } + + if($exactMatch == false && $urlDetected == false){ + $querytouse = checkformat($querytouse); + $reqwordQuery = checkformat($reqwordQuery); + } + + //perform full text search FOR InnoDB or MyISAM STORAGE ENGINE + if(($exactMatch !== true || $flagssetbyuser > 0) && $urlDetected==0 && strpos($query, ' ') == true && $flagssetbyuser + $wordcount != $flagssetbyuser){ + $outputFTS = mysqli_query($link, "SELECT id, url, title, description, body FROM windex WHERE MATCH(tags, body, description, title, url) AGAINST('$querytouse' IN BOOLEAN MODE) AND enable = '1' $additions ORDER BY CASE WHEN MATCH(tags) AGAINST('$queryWithQuotes' IN BOOLEAN MODE) THEN 30 WHEN MATCH(title) AGAINST('$queryWithQuotes' IN BOOLEAN MODE) THEN 20 WHEN MATCH(body) AGAINST('$queryWithQuotes' IN BOOLEAN MODE) OR MATCH(description) AGAINST('$queryWithQuotes' IN BOOLEAN MODE) THEN 15 WHEN MATCH(title) AGAINST('$reqwordQuery' IN BOOLEAN 
MODE) THEN 14 WHEN MATCH(title) AGAINST('$querytouse' IN BOOLEAN MODE) THEN 13 END DESC, id DESC LIMIT $lim OFFSET $offset"); + }else{ + $outputFTS = mysqli_query($link, "SELECT id, url, title, description, body FROM windex WHERE MATCH(tags, body, description, title, url) AGAINST('$queryWithQuotesAndFlags' IN BOOLEAN MODE) AND enable = '1' $additions ORDER BY CASE WHEN MATCH(tags) AGAINST('$queryWithQuotesAndFlags' IN BOOLEAN MODE) THEN 30 WHEN MATCH(title) AGAINST('$queryWithQuotesAndFlags' IN BOOLEAN MODE) THEN 20 END DESC, id DESC LIMIT $lim OFFSET $offset"); + } + + if($urlDetected == 1) + { + $query = $queryOriginal; + } + + //this will get set if position of longest word of query is found within body + $pos = -1; + + //lets put contents of the full text search into the array + while($row = mysqli_fetch_array($outputFTS)) + { + //put the contents of the URL column within the DB into an array + $id[] = $row[0]; + $url[] = $row[1]; + $title[] = substr($row[2],0,150); + $description[] = substr($row[3],0,180); + $body = $row[4]; + $count++; + $lastID = $row[0]; + + $longestWord = str_replace("''", "'",$longestWord); + $queryNoQuotesOrFlags = str_replace("''", "'",$queryNoQuotesOrFlags); + + if($exactMatch == false && ($numRequiredWords == 0 || $numRequiredWords + $wordcount == $numRequiredWords)) + { + //remove the '*' at the end of the longest word if present + //$longestWord = str_replace('*', "",$longestWord); + + //first find an exact + if(strlen($requiredword) > 0){ + $pos = stripos($body, $requiredword); + }else{ + $pos = stripos($body, $queryNoQuotesOrFlags); + } + + //search within body for position of longest query word. 
If not found, try another word + if($pos == false){ + $pos = stripos($body, $longestWord); + if($pos == false && $wordcount > 1) + { + if($longestwordelementnum > 0) + { + //if(strpos($words[longestwordelementnum],'*') == true)//remove the '*' at the end of the query if present + //$words[longestwordelementnum] = str_replace('*', "",$words[0]); + $pos = stripos($body, $words[$longestwordelementnum]); + } + else if($longestwordelementnum == 0) + { + //if(strpos($words[1],'*') == true)//remove the '*' at the end of the query if present + //$words[1] = str_replace('*', "",$words[1]); + $pos = stripos($body, $words[1]); + } + } + } + } + else + { + $pos = stripos($body, $queryNoQuotesOrFlags); + } + //still not found?, set position to 0 + if($pos == false){ + $pos = 0; + } + + //get all positions of all keywords in body + /* $lastPos = 0; + $positions = array(); + foreach($words as $word) + { + while (($lastPos = mb_strpos($body, $word, $lastPos))!== false) { + $positions[$word][] = $lastPos; + $lastPos = $lastPos + strlen($word); + } + }*/ + + //figure out how much preceding text to use + if($pos < 32) + $starttext = 0; + else if($pos > 25) + $starttext = $pos - 25; + else if($pos > 20) + $starttext = $pos - 15; + //else $starttext = 0; + + //total length of the ballpark + $textlength = 180; + + //populate the ballpark + if($pos >= 0) + { + $ballparktext = substr($body,$starttext,$textlength); + } + else $ballpark = '0'; + + //find position of nearest Period + $foundPeriod = true; + $posPeriod = stripos($ballparktext, '. 
') + $starttext +1; + + //find position of nearest Space + $foundSpace = true; + $posSpace = stripos($ballparktext, ' ') + $starttext; + + //if longest word in query is after a period+space within ballpark, reset $starttext to that point + if($pos-$starttext > $posPeriod) + { + $starttext = $posPeriod; + //populate the bodymatch + if($pos-$starttext >= 0) + { + $bodymatch[] = substr($body,$starttext,$textlength); + } + else $bodymatch[] = ''; + } + //else if($pos-starttext > $posSpace)//else if longest word in query is after a space within ballpark, reset $starttext to that point + else if($pos > $posSpace)//else if longest word in query is after a space within ballpark, reset $starttext to that point + { + $starttext = $posSpace; + //populate the bodymatch + if($pos-$starttext >= 0) + { + $bodymatch[] = substr($body,$starttext,$textlength); + } + else $bodymatch[] = ''; + } + else //else just set the bodymatch to the ballparktext + { + //populate the bodymatch + if($pos-$starttext >= 0) + { + $bodymatch[] = $ballparktext; + } + else $bodymatch[] = ''; + } + + } + + $query = $_GET['q']; + $row = null; + + if($page == 0){ + $page+=2; + }else{ + $page++; + } + + include 'results.html.php'; +} + +function checkformat($query){ + //Check if query contains a hyphenated word. Replace hyphens with a space, drop at hyphen if set as required word. 
+ if(strpos($query,'-') !== false || strpos($query,'+')){ + $hyphenwords = explode(' ',$query); + $query = ''; + $quotes = 0; + $i = 0; + foreach ($hyphenwords as $word) { + if(strpos($query,'"') !== false){ + $quotes++; + } + if((strpos($word,'-') !== false || strpos($word,'+') !== false) && $word[0] != '-' && $word[0] != '+' && $quotes%2 == 0){ //if hyphen or plus exists, not a flag, not wrapped in quotes already + $word = str_replace("-", " ",$word); + }else if(strpos($word,'+') !== false && $word[0] == '+'){//if hyphen exists and is a required word + $word = str_replace("-", " ",$word); + $spos = strpos($word, " "); + if($spos !== false){ + $word = substr($word,0,$spos);//drop at hyphen if found + } + } + if(strlen($word)>1 && $word[0]=='+' && strlen($word)<4){ + $word = substr($word,1); + } + if($i > 0){ + $query .= ' '; + } + $query .= $word; + $i++; + } + } + return $query; +} +?> diff --git a/html/insert/error.html.php b/html/insert/error.html.php new file mode 100755 index 0000000..ca173d0 --- /dev/null +++ b/html/insert/error.html.php @@ -0,0 +1,25 @@ + + + + + + + PHP Error Output + + + + + + + +

+ + + +

+ + + + diff --git a/html/insert/form.html.php b/html/insert/form.html.php new file mode 100755 index 0000000..d0c4e38 --- /dev/null +++ b/html/insert/form.html.php @@ -0,0 +1,96 @@ + + + + + + + Add to Index + + + + + + + + + +
+ +
+ + + + + +
+ +
+ + + +
+ +
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ + + +
+
+ +
+ + + + diff --git a/html/insert/index.php b/html/insert/index.php new file mode 100755 index 0000000..b542f2f --- /dev/null +++ b/html/insert/index.php @@ -0,0 +1,96 @@ + 'Strict']); + session_start(); + + if ( !isset($_POST['pass']) || !isset($_POST['user'])) + { + include 'login.html.php'; + } + else if( $_POST['user'] == '' || $_POST['pass'] == '') + { + echo "It doesn't look like you submitted a valid username or password."; + include 'login.html.php'; + } + else + { + if(!isset($_SESSION["authenticated"])) + { + include_once $_SERVER['DOCUMENT_ROOT'] . '/securimage/securimage.php'; + $securimage = new Securimage(); + if ($securimage->check($_POST['captcha_code']) == false) + { + echo "The security code entered was incorrect."; + include 'login.html.php'; + exit(); + + } + } + + $link = mysqli_connect('localhost', 'approver', 'foobar'); + $user = mysqli_real_escape_string($link, $_POST['user']); + $pass = mysqli_real_escape_string($link, $_POST['pass']); + + if (!$link) + { + $error = 'Cant connect to database.'; + include 'error.html.php'; + exit(); + } + if (!mysqli_set_charset($link, 'utf8')) + { + $error = 'Unable to set database connection encoding.'; + include 'error.html.php'; + exit(); + } + if(!mysqli_select_db($link, 'wiby')) + { + $error = 'Unable to locate the database.'; + include 'error.html.php'; + exit(); + } + $loginresult = mysqli_query($link,"SELECT hash, attempts FROM accounts WHERE name = '$user';"); + if(!$loginresult) + { + $error = 'Error fetching index: ' . mysqli_error($link); + include 'error.html.php'; + exit(); + } + + //lets put contents of accounts into an array + while($rowaccounts = mysqli_fetch_array($loginresult)) + { + $hash[] = $rowaccounts['hash']; + $attempts[] = $rowaccounts['attempts']; + } + if(password_verify($pass,$hash[0]) && $attempts[0] < 5) + { + if($attempts[0]>0) + { + if (!mysqli_query($link, "UPDATE accounts SET attempts = '0' WHERE name = '$user';")) + { + $error = 'Error fetching index: ' . 
mysqli_error($link); + include 'error.html.php'; + exit(); + } + } + + $_SESSION["authenticated"] = true; + $_SESSION["user"] = $user; + include 'insert.php'; + exit(); + } + else{ + $attempt = $attempts[0] + 1; + if (!mysqli_query($link, "UPDATE accounts SET attempts = '$attempt' WHERE name = '$user';")) + { + $error = 'Error fetching index: ' . mysqli_error($link); + include 'error.html.php'; + exit(); + } + echo "It doesn't look like you submitted a valid username or password."; + include 'login.html.php'; + } + } +?> + diff --git a/html/insert/insert.html.php b/html/insert/insert.html.php new file mode 100755 index 0000000..08e5852 --- /dev/null +++ b/html/insert/insert.html.php @@ -0,0 +1,25 @@ + + + + + + + Form Example + + + + + + + +

+ + + +

+ + + + diff --git a/html/insert/insert.php b/html/insert/insert.php new file mode 100755 index 0000000..5c6b111 --- /dev/null +++ b/html/insert/insert.php @@ -0,0 +1,115 @@ + + + diff --git a/html/insert/login.html.php b/html/insert/login.html.php new file mode 100755 index 0000000..5b622eb --- /dev/null +++ b/html/insert/login.html.php @@ -0,0 +1,31 @@ + + + + + + + wiby.me + + + + + + + +
+ Username
+ Password

+ +
+ CAPTCHA Image +
+
+ + Reload Image +
+ +
+
+ + + diff --git a/html/json/error.html.php b/html/json/error.html.php new file mode 100755 index 0000000..ca173d0 --- /dev/null +++ b/html/json/error.html.php @@ -0,0 +1,25 @@ + + + + + + + PHP Error Output + + + + + + + +

+ + + +

+ + + + diff --git a/html/json/form.html.php b/html/json/form.html.php new file mode 100755 index 0000000..b6b006d --- /dev/null +++ b/html/json/form.html.php @@ -0,0 +1,19 @@ + + + + JSON API + + + + +

Using JSON API

+

Use https://domain.com/json/ to get a JSON output of search results.

+ Example: https://domain.com/json/?q=test outputs results for the query 'test'.

+ Append the parameter &p=NUM to get the next page of results.

+ Example: https://domain.com/json/?q=test&p=2

+ Append the parameter &nsfw to include results that are marked as not safe for work. +

Terms of Use: +
1. Use this service at your own risk. +

+ + diff --git a/html/json/index.php b/html/json/index.php new file mode 100755 index 0000000..d7eb7a5 --- /dev/null +++ b/html/json/index.php @@ -0,0 +1,513 @@ + 1 && $query[strlen($query)-1]==" "){ + $query = substr($query,0,strlen($query)-1); + } + if(strlen($query) > 1 && $query[0]==" "){ + $query = substr($query,1,strlen($query)); + } + + //check if user wants to search a different time window + if(($query[0] == "!" || $query[0] == "&") && strlen($query) > 3) + { + //separate actual query from search redirect + $actualquery = ""; + $redirect = ""; + if($query[2] == " "){ + $redirect = substr($query, 1, 1); + for($i=3; $i NOW() - INTERVAL 1 DAY "; + $query = $actualquery; + }else if ($redirect == "tw"){ + $additions = $additions."AND date > NOW() - INTERVAL 7 DAY "; + $query = $actualquery; + }else if ($redirect == "tm"){ + $additions = $additions."AND date > NOW() - INTERVAL 30 DAY "; + $query = $actualquery; + }else if ($redirect == "ty"){ + $additions = $additions."AND date > NOW() - INTERVAL 365 DAY "; + $query = $actualquery; + }else{ + header('Location: '."/?q=$actualquery"); + exit(); + } + } + + //check if user wants to limit search to a specific website + if(strlen($query) > 5 && strcasecmp(substr($query, 0, 5),"site:")==0){ + //remove 'site:' + $query = substr($query, 5, strlen($query)-5); + //get site: + $site = strstr($query, ' ', true); + //now just get the search query + $query = strstr($query, ' ', false); + $query = substr($query, 1, strlen($query)-1); + //add to additions + $additions = $additions."AND url LIKE '%".$site."%' "; + } + + $page=0; + if (!isset($_REQUEST['p'])) + { + $offset=0; + } + else + { + $page = mysqli_real_escape_string($link, $_GET['p']); + $offset = $page; + if($offset > 0) + { + $offset--; + } + $offset = $offset * $lim; + } + + if (!$link) + { + $error = 'Cant connect to database.'; + include 'error.html.php'; + exit(); + } + + if (!mysqli_set_charset($link, 'utf8mb4')) + { + $error = 'Unable to set database connection 
encoding.'; + include 'error.html.php'; + exit(); + } + + if(!mysqli_select_db($link, 'wiby')) + { + $error = 'Unable to locate the database.'; + include 'error.html.php'; + exit(); + } + + $queryOriginal = $query; + + //Check if query is a url (contains http:// or https:// and no spaces). If so, put quotations around to to get an exact match + $urlDetected = 0; + //if(strpos($query, ' ') == false && strpos($query,'.') == true && strpos($query,'"') == false && preg_match('/http/',$query) == true) + if(strpos($query, ' ') == false && strpos($query,'.') == true && strpos($query,'"') == false)//note this will flag on file extensions also + { + $query = '"' . $query . '"'; + $urlDetected = 1; + } + + //did user manually set -https instead of settings cookie? + if(substr($query,-7) == " -https"){ + $filterHTTPS = true; + $query = substr($query, 0,-7); + } + + $queryNoQuotes = $query; + + //Are there quotes in the query? + $exactMatch = false; + if(strpos($queryNoQuotes,'"') !== false) + { + $exactMatch = true; + } + + //alright then lets remove the quotes + if($exactMatch == true) + { + while(strpos($queryNoQuotes,'"') !== false) + { + $queryNoQuotes = str_replace('"', "",$queryNoQuotes); + } + } + + //remove the '*' if contained anywhere in queryNoQuotes + if(strpos($queryNoQuotes,'*') !== false && $exactMatch == false){ + $queryNoQuotes = str_replace('*', "",$queryNoQuotes); + } + + //remove any flags inside queryNoQuotes, also grab any required words (+ prefix) + $queryNoQuotesOrFlags = $queryNoQuotes; + $requiredword = ''; + $flags = ''; + $wordlen = 0; + $flagssetbyuser = 0; + $numRequiredWords = 0; + if(strpos($queryNoQuotes,'+') !== false || strpos($queryNoQuotes,'-') !== false){ + $words = explode(' ', $queryNoQuotes); + $i = 0; + $queryNoQuotesOrFlags = ''; + foreach ($words as $word) { + $wordlen = strlen($word); + if($word != '' && $i != 0 && $word[0] != '-' && $word[0] != '+'){ + $queryNoQuotesOrFlags .= ' '; + } + if ($word != '' && $word[0] != '-' && 
$word[0] != '+'){ + $queryNoQuotesOrFlags .= $word; + } + if ($word != '' && $word[0] == '+' && strlen($word) > 1 && $requiredword == '' && strpos($queryNoQuotes,'-') !== false){ + $requiredword = substr($word,1); + } + if ($word != '' && ($word[0] == '-' || $word[0] == '+')){ + $flags .= " $word"; + $flagssetbyuser++; + if($word[0] == '+'){ + $numRequiredWords++; + } + } + $i++; + } + $flags = checkformat($flags); + } + + //$queryNoQuotes_SQLsafe = mysqli_real_escape_string($link, $queryNoQuotes); + //$flags = mysqli_real_escape_string($link, $flags); + + $words = explode(' ', $queryNoQuotesOrFlags); + $wordcount = 0; + $longestWord = ''; + //find longest word in query + $longestWordLength = 0; + $longestwordelementnum = 0; + foreach ($words as $word) { + if (strlen($word) > $longestWordLength) { + $longestWordLength = strlen($word); + $longestWord = $word; + $longestwordelementnum = $wordcount; + } + if($word != ''){ + $wordcount++; + } + } + + //create another query where all compatible words from queryNoQuotesOrFlags are marked as keywords + $reqwordQuery = ''; + $i=0; + $wordlen=0; + foreach ($words as $word) { + $wordlen = strlen($word); + if($i==0 && $wordlen > 3 && ($word[0] == '+' || $word[0] == '-')){ + $reqwordQuery .= "$word"; + }else if($i==0 && $wordlen > 1 && $word[0] != '+' && $word[0] != '-'){ + if($wordlen > 2){ + $reqwordQuery .= "+$word"; + }else{ + $reqwordQuery .= "$word"; + } + }else if($i==0){ + $reqwordQuery .= "$word"; + } + if($i!=0 && $wordlen > 3 && ($word[0] == '+' || $word[0] == '-')){ + $reqwordQuery .= " $word"; + }else if($i!=0 && $wordlen > 1 && $word[0] != '+' && $word[0] != '-' ){ + if($wordlen > 2){ + $reqwordQuery .= " +$word"; + }else{ + $reqwordQuery .= " $word"; + } + }else if($i!=0){ + $reqwordQuery .= " $word"; + } + $i++; + } + $reqwordQuery = checkformat($reqwordQuery); + $reqwordQuery .= " $flags"; + + //if no required words set, make the longest word in the query required. 
+ $querywithrequiredword = ""; + if($numRequiredWords == 0 && $wordcount > 1 && $longestWordLength > 2){ + $querywithrequiredword = $query .= " +$longestWord"; + } + + if($filterHTTPS == true){ + $additions = $additions."AND http = '1' "; + } + if($worksafe == true){ + $additions = $additions."AND worksafe = '1' "; + } + + $count = 0; + + $queryWithQuotesAndFlags = '"'. $queryNoQuotesOrFlags.'"'.$flags.''; + $queryWithQuotes = '"'. $queryNoQuotesOrFlags.'"'; + + //if query is just 1 or 2 letters, help make it work. + if(iconv_strlen($queryOriginal) < 3){ + $query = "".$query."*"; + $queryWithQuotesAndFlags = $query; + $reqwordQuery = $query; + } + if(stripos($queryOriginal,"c++")!==false){// :) :( :) :( + $exactMatch=true; + $queryWithQuotesAndFlags .= " +programming"; + if(strpos($queryOriginal," ")!==false && $longestWordLength>3){ + $queryWithQuotesAndFlags .= " +$longestWord"; + } + } + + if($querywithrequiredword != ""){ + $querytouse = $querywithrequiredword; + }else if($numRequiredWords > 0){ + $querytouse = $reqwordQuery; + }else{ + $querytouse = $query; + } + + if($exactMatch == false && $urlDetected == false){ + $querytouse = checkformat($querytouse); + $reqwordQuery = checkformat($reqwordQuery); + } + + //perform full text search FOR InnoDB or MyISAM STORAGE ENGINE + if(($exactMatch !== true || $flagssetbyuser > 0) && $urlDetected==0 && strpos($query, ' ') == true && $flagssetbyuser + $wordcount != $flagssetbyuser){ + $outputFTS = mysqli_query($link, "SELECT id, url, title, description, body FROM windex WHERE MATCH(tags, body, description, title, url) AGAINST('$querytouse' IN BOOLEAN MODE) AND enable = '1' $additions ORDER BY CASE WHEN MATCH(tags) AGAINST('$queryWithQuotes' IN BOOLEAN MODE) THEN 30 WHEN MATCH(title) AGAINST('$queryWithQuotes' IN BOOLEAN MODE) THEN 20 WHEN MATCH(body) AGAINST('$queryWithQuotes' IN BOOLEAN MODE) OR MATCH(description) AGAINST('$queryWithQuotes' IN BOOLEAN MODE) THEN 15 WHEN MATCH(title) AGAINST('$reqwordQuery' IN BOOLEAN 
MODE) THEN 14 WHEN MATCH(title) AGAINST('$querytouse' IN BOOLEAN MODE) THEN 13 END DESC, id DESC LIMIT $lim OFFSET $offset"); + }else{ + $outputFTS = mysqli_query($link, "SELECT id, url, title, description, body FROM windex WHERE MATCH(tags, body, description, title, url) AGAINST('$queryWithQuotesAndFlags' IN BOOLEAN MODE) AND enable = '1' $additions ORDER BY CASE WHEN MATCH(tags) AGAINST('$queryWithQuotesAndFlags' IN BOOLEAN MODE) THEN 30 WHEN MATCH(title) AGAINST('$queryWithQuotesAndFlags' IN BOOLEAN MODE) THEN 20 END DESC, id DESC LIMIT $lim OFFSET $offset"); + } + + if($urlDetected == 1) + { + $query = $queryOriginal; + } + + //this will get set if position of longest word of query is found within body + $pos = -1; + + //lets put contents of the full text search into the array + while($row = mysqli_fetch_array($outputFTS)) + { + //put the contents of the URL column within the DB into an array + $id[] = $row[0]; + $url[] = $row[1]; + $title[] = JSONRealEscapeString(substr($row[2],0,150)); + $description[] = JSONRealEscapeString(substr($row[3],0,180)); + $body = JSONRealEscapeString($row[4]); + $count++; + $lastID = $row[0]; + + $longestWord = str_replace("''", "'",$longestWord); + $queryNoQuotesOrFlags = str_replace("''", "'",$queryNoQuotesOrFlags); + + if($exactMatch == false && ($numRequiredWords == 0 || $numRequiredWords + $wordcount == $numRequiredWords)) + { + //remove the '*' at the end of the longest word if present + //$longestWord = str_replace('*', "",$longestWord); + + //first find an exact + if(strlen($requiredword) > 0){ + $pos = stripos($body, $requiredword); + }else{ + $pos = stripos($body, $queryNoQuotesOrFlags); + } + + //search within body for position of longest query word. 
If not found, try another word + if($pos == false){ + $pos = stripos($body, $longestWord); + if($pos == false && $wordcount > 1) + { + if($longestwordelementnum > 0) + { + //if(strpos($words[longestwordelementnum],'*') == true)//remove the '*' at the end of the query if present + //$words[longestwordelementnum] = str_replace('*', "",$words[0]); + $pos = stripos($body, $words[$longestwordelementnum]); + } + else if($longestwordelementnum == 0) + { + //if(strpos($words[1],'*') == true)//remove the '*' at the end of the query if present + //$words[1] = str_replace('*', "",$words[1]); + $pos = stripos($body, $words[1]); + } + } + } + } + else + { + $pos = stripos($body, $queryNoQuotesOrFlags); + } + //still not found?, set position to 0 + if($pos == false){ + $pos = 0; + } + + //get all positions of all keywords in body + /* $lastPos = 0; + $positions = array(); + foreach($words as $word) + { + while (($lastPos = mb_strpos($body, $word, $lastPos))!== false) { + $positions[$word][] = $lastPos; + $lastPos = $lastPos + strlen($word); + } + }*/ + + //figure out how much preceding text to use + if($pos < 32) + $starttext = 0; + else if($pos > 25) + $starttext = $pos - 25; + else if($pos > 20) + $starttext = $pos - 15; + //else $starttext = 0; + + //total length of the ballpark + $textlength = 180; + + //populate the ballpark + if($pos >= 0) + { + $ballparktext = substr($body,$starttext,$textlength); + } + else $ballpark = '0'; + + //find position of nearest Period + $foundPeriod = true; + $posPeriod = stripos($ballparktext, '. 
') + $starttext +1; + + //find position of nearest Space + $foundSpace = true; + $posSpace = stripos($ballparktext, ' ') + $starttext; + + //if longest word in query is after a period+space within ballpark, reset $starttext to that point + if($pos-$starttext > $posPeriod) + { + $starttext = $posPeriod; + //populate the bodymatch + if($pos-$starttext >= 0) + { + $bodymatch[] = substr($body,$starttext,$textlength); + } + else $bodymatch[] = ''; + } + //else if($pos-starttext > $posSpace)//else if longest word in query is after a space within ballpark, reset $starttext to that point + else if($pos > $posSpace)//else if longest word in query is after a space within ballpark, reset $starttext to that point + { + $starttext = $posSpace; + //populate the bodymatch + if($pos-$starttext >= 0) + { + $bodymatch[] = substr($body,$starttext,$textlength); + } + else $bodymatch[] = ''; + } + else //else just set the bodymatch to the ballparktext + { + //populate the bodymatch + if($pos-$starttext >= 0) + { + $bodymatch[] = $ballparktext; + } + else $bodymatch[] = ''; + } + + } + + $query = $_GET['q']; + $row = null; + + if($page == 0){ + $page+=2; + }else{ + $page++; + } + + include 'results.json.php'; +} + +function checkformat($query){ + //Check if query contains a hyphenated word. Replace hyphens with a space, drop at hyphen if set as required word. 
+ if(strpos($query,'-') !== false || strpos($query,'+')){ + $hyphenwords = explode(' ',$query); + $query = ''; + $quotes = 0; + $i = 0; + foreach ($hyphenwords as $word) { + if(strpos($query,'"') !== false){ + $quotes++; + } + if((strpos($word,'-') !== false || strpos($word,'+') !== false) && $word[0] != '-' && $word[0] != '+' && $quotes%2 == 0){ //if hyphen or plus exists, not a flag, not wrapped in quotes already + $word = str_replace("-", " ",$word); + }else if(strpos($word,'+') !== false && $word[0] == '+'){//if hyphen exists and is a required word + $word = str_replace("-", " ",$word); + $spos = strpos($word, " "); + if($spos !== false){ + $word = substr($word,0,$spos);//drop at hyphen if found + } + } + if(strlen($word)>1 && $word[0]=='+' && strlen($word)<4){ + $word = substr($word,1); + } + if($i > 0){ + $query .= ' '; + } + $query .= $word; + $i++; + } + } + return $query; +} + +function JSONRealEscapeString($var){ + $var = str_replace("\\","\\\\",$var); + $var = str_replace("\t","\\t",$var); + $var = str_replace("\b","\\b",$var); + $var = str_replace("\n","\\n",$var); + $var = str_replace("\r","\\r",$var); + $var = str_replace("\f","\\f",$var); + return $var; +} + +?> + diff --git a/html/json/results.json.php b/html/json/results.json.php new file mode 100755 index 0000000..0567269 --- /dev/null +++ b/html/json/results.json.php @@ -0,0 +1,20 @@ + +[ + + + + + +",">",$title[$i]); ?> +",">",$bodymatch[$i]); ?> +",">",$description[$i]); ?> + { + "URL": "", + "Title": "", + "Snippet": "", + "Description": "" + }, + + +] + diff --git a/html/opensearch.xml b/html/opensearch.xml new file mode 100755 index 0000000..7b71425 --- /dev/null +++ b/html/opensearch.xml @@ -0,0 +1,14 @@ + + + Title + Enter your description + web internet cyber interest subject + Put a contact if you want + + Long title name + URL_TO_FAVICON.ICO_FILE + + en-us + UTF-8 + URL_TO_YOUR_SEARCH_ENGINE + diff --git a/html/readf/error.html.php b/html/readf/error.html.php new file mode 100755 index 
0000000..ca173d0 --- /dev/null +++ b/html/readf/error.html.php @@ -0,0 +1,25 @@ + + + + + + + PHP Error Output + + + + + + + +

+ + + +

+ + + + diff --git a/html/readf/feedback.php b/html/readf/feedback.php new file mode 100755 index 0000000..3aec3a7 --- /dev/null +++ b/html/readf/feedback.php @@ -0,0 +1,111 @@ += '".$startID."' AND id <= '".$endID."'"); + if(!$result) + { + $error = 'Error fetching index: ' . mysqli_error($link); + include 'error.html.php'; + exit(); + } + } + else + { + $result = mysqli_query($link,"SELECT * FROM feedback LIMIT $lim"); + if(!$result) + { + $error = 'Error fetching index: ' . mysqli_error($link); + include 'error.html.php'; + exit(); + } + } + + //lets put contents of index into an array + while($row = mysqli_fetch_array($result)) + { + $id[] = $row['id']; + $message[] = $row['message']; + $time[] = $row['time']; + } + + if ($_SERVER['REQUEST_METHOD'] === 'POST' && isset($_POST['startid']) && $_SESSION["loadfeedback"]==false) + { //remove selected feedback + $i=0; + foreach($id as $pageid) + { + + if($_POST["drop$pageid"] == 'on') + { + $result2 = mysqli_query($link,"DELETE FROM feedback WHERE id = '".$pageid."'"); + if(!$result2) + { + $error = 'Error deleting from feedback: ' . mysqli_error($link); + include 'error.html.php'; + exit(); + } + } + $i++; + } + $_SESSION["loadfeedback"]=true; + unset($id); + unset($message); + unset($time); + unset($startID); + unset($endID); + unset($result); + unset($result2); + $link -> close(); + include 'feedback.php'; + exit(); + } + else + { + $_SESSION["loadfeedback"]=false; + include 'form.html.php'; + } +?> + + diff --git a/html/readf/form.html.php b/html/readf/form.html.php new file mode 100755 index 0000000..35fa4ad --- /dev/null +++ b/html/readf/form.html.php @@ -0,0 +1,47 @@ + + + + + + + Feedback + + + + + + + + +
+

Some feedback awaiting review:



+ + + + +

+ +

+ Time:
+ [Drop] + +


+ + +
+ +
+ + +
+ + + diff --git a/html/readf/index.php b/html/readf/index.php new file mode 100755 index 0000000..f9a4288 --- /dev/null +++ b/html/readf/index.php @@ -0,0 +1,98 @@ + 'Strict']); + session_start(); + + if ( !isset($_POST['pass']) || !isset($_POST['user'])) + { + include 'login.html.php'; + } + else if( $_POST['user'] == '' || $_POST['pass'] == '') + { + echo "It doesn't look like you submitted a valid username or password."; + include 'login.html.php'; + } + else + { + if(!isset($_SESSION["authenticated"])) + { + include_once $_SERVER['DOCUMENT_ROOT'] . '/securimage/securimage.php'; + $securimage = new Securimage(); + if ($securimage->check($_POST['captcha_code']) == false) + { + echo "The security code entered was incorrect."; + include 'login.html.php'; + exit(); + + } + } + + $link = mysqli_connect('localhost', 'approver', 'foobar'); + $user = mysqli_real_escape_string($link, $_POST['user']); + $pass = mysqli_real_escape_string($link, $_POST['pass']); + + if (!$link) + { + $error = 'Cant connect to database.'; + include 'error.html.php'; + exit(); + } + if (!mysqli_set_charset($link, 'utf8')) + { + $error = 'Unable to set database connection encoding.'; + include 'error.html.php'; + exit(); + } + if(!mysqli_select_db($link, 'wiby')) + { + $error = 'Unable to locate the database.'; + include 'error.html.php'; + exit(); + } + $loginresult = mysqli_query($link,"SELECT hash, attempts, level FROM accounts WHERE name = '$user';"); + if(!$loginresult) + { + $error = 'Error fetching index: ' . 
mysqli_error($link); + include 'error.html.php'; + exit(); + } + + //lets put contents of accounts into an array + while($rowaccounts = mysqli_fetch_array($loginresult)) + { + $hash[] = $rowaccounts['hash']; + $attempts[] = $rowaccounts['attempts']; + $level[] = $rowaccounts['level']; + } + if(password_verify($pass,$hash[0]) && $attempts[0] < 5 && $level[0] == "admin") + { + if($attempts[0]>0) + { + if (!mysqli_query($link, "UPDATE accounts SET attempts = '0' WHERE name = '$user';")) + { + $error = 'Error fetching index: ' . mysqli_error($link); + include 'error.html.php'; + exit(); + } + } + + $_SESSION["authenticated"] = true; + $_SESSION["user"] = $user; + $_SESSION["level"] = $level[0]; + include 'feedback.php'; + exit(); + } + else{ + $attempt = $attempts[0] + 1; + if (!mysqli_query($link, "UPDATE accounts SET attempts = '$attempt' WHERE name = '$user';")) + { + $error = 'Error fetching index: ' . mysqli_error($link); + include 'error.html.php'; + exit(); + } + echo "It doesn't look like you submitted a valid username or password."; + include 'login.html.php'; + } + } +?> + diff --git a/html/readf/login.html.php b/html/readf/login.html.php new file mode 100755 index 0000000..fec308d --- /dev/null +++ b/html/readf/login.html.php @@ -0,0 +1,31 @@ + + + + + + + wiby.me + + + + + + + +
+ Username
+ Password

+ +
+ CAPTCHA Image +
+
+ + Reload Image +
+ +
+
+ + + diff --git a/html/results.html.php b/html/results.html.php new file mode 100755 index 0000000..f3f4cc5 --- /dev/null +++ b/html/results.html.php @@ -0,0 +1,44 @@ + + + + + <?php echo htmlspecialchars($query, ENT_QUOTES, 'UTF-8');?> + + + + + + +
+
+ name   + + +
+

+
+ +


+ + + + + + ",">",$title[$i]); ?> + ",">",$bodymatch[$i]); ?> + ",">",$description[$i]); ?> +

+

+ +
+ +
+ + + 2 && $starappend == 0): ?> +


Find more...
+ +


That's everything I could find.
Help make me smarter by submitting a page.

+ + + diff --git a/html/review/error.html.php b/html/review/error.html.php new file mode 100755 index 0000000..ca173d0 --- /dev/null +++ b/html/review/error.html.php @@ -0,0 +1,25 @@ + + + + + + + PHP Error Output + + + + + + + +

+ + + +

+ + + + diff --git a/html/review/index.php b/html/review/index.php new file mode 100755 index 0000000..08e1248 --- /dev/null +++ b/html/review/index.php @@ -0,0 +1,97 @@ + 'Strict']); + session_start(); + + if ( !isset($_POST['pass']) || !isset($_POST['user'])) + { + include 'login.html.php'; + } + else if( $_POST['user'] == '' || $_POST['pass'] == '') + { + echo "It doesn't look like you submitted a valid username or password."; + include 'login.html.php'; + } + else + { + if(!isset($_SESSION["authenticated"])) + { + include_once $_SERVER['DOCUMENT_ROOT'] . '/securimage/securimage.php'; + $securimage = new Securimage(); + if ($securimage->check($_POST['captcha_code']) == false) + { + echo "The security code entered was incorrect."; + include 'login.html.php'; + exit(); + + } + } + + $link = mysqli_connect('localhost', 'approver', 'foobar'); + $user = mysqli_real_escape_string($link, $_POST['user']); + $pass = mysqli_real_escape_string($link, $_POST['pass']); + + if (!$link) + { + $error = 'Cant connect to database.'; + include 'error.html.php'; + exit(); + } + if (!mysqli_set_charset($link, 'utf8')) + { + $error = 'Unable to set database connection encoding.'; + include 'error.html.php'; + exit(); + } + if(!mysqli_select_db($link, 'wiby')) + { + $error = 'Unable to locate the database.'; + include 'error.html.php'; + exit(); + } + $loginresult = mysqli_query($link,"SELECT hash, attempts FROM accounts WHERE name = '$user';"); + if(!$loginresult) + { + $error = 'Error fetching index: ' . mysqli_error($link); + include 'error.html.php'; + exit(); + } + + //lets put contents of accounts into an array + while($rowaccounts = mysqli_fetch_array($loginresult)) + { + $hash[] = $rowaccounts['hash']; + $attempts[] = $rowaccounts['attempts']; + } + if(password_verify($pass,$hash[0]) && $attempts[0] < 5) + { + if($attempts[0]>0) + { + if (!mysqli_query($link, "UPDATE accounts SET attempts = '0' WHERE name = '$user';")) + { + $error = 'Error fetching index: ' . 
mysqli_error($link); + include 'error.html.php'; + exit(); + } + } + + $_SESSION["authenticated"] = true; + $_SESSION["user"] = $user; + $_SESSION["loadreview"]=true; + include 'review.php'; + exit(); + } + else{ + $attempt = $attempts[0] + 1; + if (!mysqli_query($link, "UPDATE accounts SET attempts = '$attempt' WHERE name = '$user';")) + { + $error = 'Error fetching index: ' . mysqli_error($link); + include 'error.html.php'; + exit(); + } + echo "It doesn't look like you submitted a valid username or password."; + include 'login.html.php'; + } + } +?> + diff --git a/html/review/login.html.php b/html/review/login.html.php new file mode 100755 index 0000000..18aa986 --- /dev/null +++ b/html/review/login.html.php @@ -0,0 +1,31 @@ + + + + + + + wiby.me + + + + + + + +
+ Username
+ Password

+ +
+ CAPTCHA Image +
+
+ + Reload Image +
+ +
+
+ + + diff --git a/html/review/review.php b/html/review/review.php new file mode 100755 index 0000000..bcb742b --- /dev/null +++ b/html/review/review.php @@ -0,0 +1,239 @@ += '".$startID."' AND id <= '".$endID."'"); + if(!$result) + { + $error = 'Error fetching index: ' . mysqli_error($link); + include 'error.html.php'; + exit(); + } + } + else + { + //check reviewqueue table for rows that are reserverd within reservetime. Do not select reserved rows. If reserved rows exceed 30mins, they can be reserved by different approver. + $result = mysqli_query($link,"SELECT * FROM reviewqueue WHERE reserved IS NULL OR reserved = '".$_SESSION["user"]."' OR reservetime < NOW() - INTERVAL 30 MINUTE LIMIT $lim"); + if(!$result) + { + $error = 'Error fetching index: ' . mysqli_error($link); + include 'error.html.php'; + exit(); + } + } + + //lets put contents of reviewqueue into an array + while($row = mysqli_fetch_array($result)) + { + $id[] = $row['id']; + $url[] = str_replace("'", "%27", $row['url']); + $worksafe[] = $row['worksafe']; + } + + if ($_SERVER['REQUEST_METHOD'] === 'POST' && isset($_POST['startid']) && $_SESSION["loadreview"]==false) + { //store approved url list into indexqueue + $i=0; + $num_crawlers=1;//modify this variable to the number of crawlers you are using in parallel. 
+ $crawler_id=1; + foreach($id as $pageid) + { + if($_POST["deny$pageid"] != 'on' && $_POST["skip$pageid"] != 'on' && $_POST["bury$pageid"] != 'on') + { + + $worksafe = mysqli_real_escape_string($link, $_POST["worksafe$pageid"]); + if($worksafe == 'on') + { + $worksafe = 1; + } + else + { + $worksafe = 0; + } + + if($_POST["surprise$pageid"] == 'on') + { + $surprise = 1; + } + else + { + $surprise = 0; + } + + if($_POST["forcerules$pageid"] == 'on') + { + $forcerules = 1; + } + else + { + $forcerules = 0; + } + + if($_POST["crawlrepeat$pageid"] == 'on') + { + $crawlrepeat = 1; + } + else + { + $crawlrepeat = 0; + } + + $updatable = $_POST["updatable$pageid"]; + $crawldepth = $_POST["crawldepth$pageid"]; + $crawlpages = $_POST["crawlpages$pageid"]; + $crawltype = $_POST["crawltype$pageid"]; + + + $sql = "INSERT INTO indexqueue (url,worksafe,approver,surprise,updatable,crawl_depth,crawl_pages,crawl_type,force_rules,crawl_repeat,crawler_id) VALUES ('".$url[$i]."','".$worksafe."','".$_SESSION["user"]."','".$surprise."','".$updatable."','".$crawldepth."','".$crawlpages."','".$crawltype."','".$forcerules."','".$crawlrepeat."','".$crawler_id."')"; + if (!mysqli_query($link, $sql)) + { + $error = 'Error inserting into indexqueue: ' . mysqli_error($link); + include 'error.html.php'; + exit(); + } + } + if($_POST["bury$pageid"] == 'on' && $_POST["skip$pageid"] != 'on' && $_POST["deny$pageid"] != 'on') + { + + $worksafe = mysqli_real_escape_string($link, $_POST["worksafe$pageid"]); + if($worksafe == 'on') + { + $worksafe = 1; + } + else + { + $worksafe = 0; + } + $sql = "INSERT INTO graveyard (url,worksafe) VALUES ('".$url[$i]."','".$worksafe."')"; + if (!mysqli_query($link, $sql)) + { + $error = 'Error inserting into indexqueue: ' . 
mysqli_error($link); + include 'error.html.php'; + exit(); + } + + } + //put denied pages into wibytemp rejected table + if($_POST["bury$pageid"] != 'on' && $_POST["skip$pageid"] != 'on' && $_POST["deny$pageid"] == 'on') + { + if(!mysqli_select_db($link, 'wibytemp')) + { + $error = 'Unable to locate the database.'. mysqli_error($link);; + include 'error.html.php'; + exit(); + } + $sql = "INSERT INTO rejected (url,user,date) VALUES ('".$url[$i]."','".$_SESSION["user"]."',now())"; + if (!mysqli_query($link, $sql)) + { + $error = 'Error inserting into indexqueue: ' . mysqli_error($link); + include 'error.html.php'; + exit(); + } + if(!mysqli_select_db($link, 'wiby')) + { + $error = 'Unable to locate the database...'; + include 'error.html.php'; + exit(); + } + } + if($_POST["skip$pageid"] != 'on') + { + $result2 = mysqli_query($link,"DELETE FROM reviewqueue WHERE id = $pageid"); + if(!$result2) + { + $error = 'Error deleting from reviewqueue: ' . mysqli_error($link); + include 'error.html.php'; + exit(); + } + } + $i++; + if($crawler_id == $num_crawlers){ + $crawler_id = 1; + }else{ + $crawler_id++; + } + } + + $_SESSION["loadreview"]=true; + unset($id); + unset($url); + unset($worksafe); + unset($startID); + unset($endID); + unset($result); + $link -> close(); + include 'review.php'; + //include 'refresh.html'; + exit(); + } + else + { + $_SESSION["loadreview"]=false; + //insert approver into reserved, reservetime will autoupdate, so that they cannot be taken by a different approver for 30 mins. + foreach($id as $pageid) + { + $result = mysqli_query($link,"UPDATE reviewqueue SET reserved = '".$_SESSION["user"]."' WHERE id = $pageid"); + if(!$result) + { + $error = 'Error fetching index: ' . mysqli_error($link); + include 'error.html.php'; + exit(); + } + } + + //get total number of rows remaining in queue + $totalrows = mysqli_query($link,"select count(id) from reviewqueue"); + if(!$totalrows) + { + $error = 'Error fetching index: ' . 
mysqli_error($link); + include 'error.html.php'; + exit(); + } + //get result of total rows remaining in queue + while($row = mysqli_fetch_array($totalrows)) + { + $queuesize = $row['count(id)']; + echo $queuesize . " pages queued in total."; + } + + include 'reviewqueue.html.php'; + } +?> + + diff --git a/html/review/reviewqueue.html.php b/html/review/reviewqueue.html.php new file mode 100755 index 0000000..f27eb3c --- /dev/null +++ b/html/review/reviewqueue.html.php @@ -0,0 +1,66 @@ + + + + + + + Awaiting Approval + + + + + + + + +
+

Some pages awaiting review:

+ + + + +

+ +
+ + [Worksafe] + + [Worksafe] + + [Surprise] + [Skip] + [Bury] + [Deny] + [Updatable] + + [Crawl: Depth + Pages + Type + Enforce Rules + Repeat] +

+ + +
+ +
+ + +
+ + + diff --git a/html/securimage/AHGBold.ttf b/html/securimage/AHGBold.ttf new file mode 100755 index 0000000..764b23d Binary files /dev/null and b/html/securimage/AHGBold.ttf differ diff --git a/html/securimage/LICENSE.txt b/html/securimage/LICENSE.txt new file mode 100755 index 0000000..889bc2c --- /dev/null +++ b/html/securimage/LICENSE.txt @@ -0,0 +1,25 @@ +COPYRIGHT: + Copyright (c) 2011 Drew Phillips + All rights reserved. + + Redistribution and use in source and binary forms, with or without modification, + are permitted provided that the following conditions are met: + + - Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. 
+ diff --git a/html/securimage/README.FONT.txt b/html/securimage/README.FONT.txt new file mode 100755 index 0000000..d4770de --- /dev/null +++ b/html/securimage/README.FONT.txt @@ -0,0 +1,12 @@ +AHGBold.ttf is used by Securimage under the following license: + +Alte Haas Grotesk is a typeface that look like an helvetica printed in an old Muller-Brockmann Book. + +These fonts are freeware and can be distributed as long as they are +together with this text file. + +I would appreciate very much to see what you have done with it anyway. + +yann le coroller +www.yannlecoroller.com +yann@lecoroller.com \ No newline at end of file diff --git a/html/securimage/README.md b/html/securimage/README.md new file mode 100755 index 0000000..a26bac4 --- /dev/null +++ b/html/securimage/README.md @@ -0,0 +1,244 @@ +## Name: + +**Securimage** - A PHP class for creating captcha images and audio with many options. + +## Version: + +**3.6.7** + +## Author: + +Drew Phillips + +## Download: + +The latest version can always be found at [phpcaptcha.org](https://www.phpcaptcha.org) + +## Documentation: + +Online documentation of the class, methods, and variables can be found +at http://www.phpcaptcha.org/Securimage_Docs/ + +## Requirements: + +* PHP 5.4 or greater +* GD 2.0 +* FreeType (Required, for TTF fonts) +* PDO (if using Sqlite, MySQL, or PostgreSQL) + +## Synopsis: + +**Within your HTML form** + +
+ .. form elements + +
+ +
+
+ + + **Within your PHP form processor** + + require_once 'securimage.php'; + + // Code Validation + + $image = new Securimage(); + if ($image->check($_POST['captcha_code']) == true) { + echo "Correct!"; + } else { + echo "Sorry, wrong code."; + } + +## Description: + +What is **Securimage**? + +Securimage is a PHP class that is used to generate and validate CAPTCHA images. + +The class uses an existing PHP session or creates its own if none is found to +store the CAPTCHA code. In addition, a database can be used instead of +session storage. + +Variables within the class are used to control the style and display of the +image. The class uses TTF fonts and effects for strengthening the security of +the image. + +It also creates audible codes which are played for visually impaired users. + +## UPGRADE NOTICE: + +**3.6.3 and below:** +Securimage 3.6.4 fixed an XSS vulnerability in example_form.ajax.php. It is +recommended to upgrade to the latest version or delete example_form.ajax.php +from the securimage directory on your website. + +**3.6.2 and above:** + +If you are upgrading to 3.6.2 or greater *AND* are using database storage, +the table structure has changed in 3.6.2 adding an audio_data column for +storing audio files in the database in order to support HTTP range +requests. Delete your tables and have Securimage recreate them or see +the function createDatabaseTables() in securimage.php for the new structure +depending on which database backend you are using and alter the tables as +needed. If using SQLite, just overwrite your existing securimage.sq3 file +with the one from this release. + +*If you are not using database tables for storage, ignore this notice.* + +## Copyright: +Script + Copyright (c) 2018 Drew Phillips + All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + - Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + +## Licenses: + +**WavFile.php** + + The WavFile.php class used in Securimage by Drew Phillips and Paul Voegler + is used under the BSD License. See WavFile.php for details. + Many thanks to Paul Voegler (http://www.voegler.eu/) for contributing to + Securimage. +Script +--------------------------------------------------------------------------- + +**Flash code for Securimage** + +Flash code created by Age Bosma & Mario Romero (animario@hotmail.com) +Many thanks for releasing this to the project! 
+ +--------------------------------------------------------------------------- + +**HKCaptcha** + +Portions of Securimage contain code from Han-Kwang Nienhuys' PHP captcha + + Han-Kwang Nienhuys' PHP captcha + Copyright June 2007 + + This copyright message and attribution must be preserved upon + modification. Redistribution under other licenses is expressly allowed. + Other licenses include GPL 2 or higher, BSD, and non-free licenses. + The original, unrestricted version can be obtained from + http://www.lagom.nl/linux/hkcaptcha/ + +--------------------------------------------------------------------------- + +**AHGBold.ttf** + + AHGBold.ttf (AlteHaasGroteskBold.ttf) font was created by Yann Le Coroller + and is distributed as freeware. + + Alte Haas Grotesk is a typeface that look like an helvetica printed in an + old Muller-Brockmann Book. + + These fonts are freeware and can be distributed as long as they are + together with this text file. + + I would appreciate very much to see what you have done with it anyway. + + yann le coroller + www.yannlecoroller.com + yann@lecoroller.com + +--------------------------------------------------------------------------- + +**PopForge Flash Library** + +Portions of securimage_play.swf use the PopForge flash library for playing audio + + /** + * Copyright(C) 2007 Andre Michelle and Joa Ebert + * + * PopForge is an ActionScript3 code sandbox developed by Andre Michelle + * and Joa Ebert + * http://sandbox.popforge.de + * + * PopforgeAS3Audio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * PopforgeAS3Audio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see + */ + +-------------------------------------------------------------------------- + +**Graphics** + +Some graphics used are from the Humility Icon Pack by WorLord + + License: GNU/GPL (http://findicons.com/pack/1723/humility) + http://findicons.com/icon/192558/gnome_volume_control + http://findicons.com/icon/192562/gtk_refresh + +-------------------------------------------------------------------------- + + +**Background noise sound files are from SoundJay.com** + +http://www.soundjay.com/tos.html + + All sound effects on this website are created by us and protected under + the copyright laws, international treaty provisions and other applicable + laws. By downloading sounds, music or any material from this site implies + that you have read and accepted these terms and conditions: + + Sound Effects + You are allowed to use the sounds free of charge and royalty free in your + projects (such as films, videos, games, presentations, animations, stage + plays, radio plays, audio books, apps) be it for commercial or + non-commercial purposes. + + But you are NOT allowed to + - post the sounds (as sound effects or ringtones) on any website for + others to download, copy or use + - use them as a raw material to create sound effects or ringtones that + you will sell, distribute or offer for downloading + - sell, re-sell, license or re-license the sounds (as individual sound + effects or as a sound effects library) to anyone else + - claim the sounds as yours + - link directly to individual sound files + - distribute the sounds in apps or computer programs that are clearly + sound related in nature (such as sound machine, sound effect + generator, ringtone maker, funny sounds app, sound therapy app, etc.) 
+ or in apps or computer programs that use the sounds as the program's + sound resource library for other people's use (such as animation + creator, digital book creator, song maker software, etc.). If you are + developing such computer programs, contact us for licensing options. + + If you use the sound effects, please consider giving us a credit and + linking back to us but it's not required. + diff --git a/html/securimage/README.txt b/html/securimage/README.txt new file mode 100755 index 0000000..a50a98c --- /dev/null +++ b/html/securimage/README.txt @@ -0,0 +1,222 @@ +NAME: + + Securimage - A PHP class for creating captcha images and audio with many options. + +VERSION: + + 3.6.7 + +AUTHOR: + + Drew Phillips + +DOWNLOAD: + + The latest version can always be + found at http://www.phpcaptcha.org + +DOCUMENTATION: + + Online documentation of the class, methods, and variables can + be found at http://www.phpcaptcha.org/Securimage_Docs/ + +REQUIREMENTS: + + PHP 5.4 or greater + GD 2.0 + FreeType (Required, for TTF fonts) + PDO (if using Sqlite, MySQL, or PostgreSQL) + +SYNOPSIS: + + require_once 'securimage.php'; + + **Within your HTML form** + +
+ .. form elements + +
+ +
+
+ + + **Within your PHP form processor** + + // Code Validation + + $image = new Securimage(); + if ($image->check($_POST['captcha_code']) == true) { + echo "Correct!"; + } else { + echo "Sorry, wrong code."; + } + +DESCRIPTION: + + What is Securimage? + + Securimage is a PHP class that is used to generate and validate CAPTCHA + images. + + The class uses an existing PHP session or creates its own if + none is found to store the CAPTCHA code. In addition, a database can be + used instead of session storage. + + Variables within the class are used to control the style and display of + the image. The class uses TTF fonts and effects for strengthening the + security of the image. + + It also creates audible codes which are played for visually impaired users. + +UPGRADE NOTICE: + 3.6.3 and below: + Securimage 3.6.4 fixed an XSS vulnerability in example_form.ajax.php. It is + recommended to upgrade to the latest version or delete example_form.ajax.php + from the securimage directory on your website. + + 3.6.2 and above: + If you are upgrading to 3.6.2 or greater AND are using database storage, + the table structure has changed in 3.6.2 adding an audio_data column for + storing audio files in the database in order to support HTTP range + requests. Delete your tables and have Securimage recreate them or see + the function createDatabaseTables() in securimage.php for the new structure + depending on which database backend you are using and alter the tables as + needed. If using SQLite, just overwrite your existing securimage.sq3 file + with the one from this release. + + If you are not using database tables for storage, ignore this notice. + +COPYRIGHT: + + Copyright (c) 2018 Drew Phillips + All rights reserved. 
+ + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + - Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + - Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + POSSIBILITY OF SUCH DAMAGE. + +LICENSES: + + The WavFile.php class used in Securimage by Drew Phillips and Paul Voegler + is used under the BSD License. See WavFile.php for details. + Many thanks to Paul Voegler (http://www.voegler.eu/) for contributing to + Securimage. + + --------------------------------------------------------------------------- + Flash code created by Age Bosma & Mario Romero (animario@hotmail.com) + Many thanks for releasing this to the project! 
+ + --------------------------------------------------------------------------- + Portions of Securimage contain code from Han-Kwang Nienhuys' PHP captcha + + Han-Kwang Nienhuys' PHP captcha + Copyright June 2007 + + This copyright message and attribution must be preserved upon + modification. Redistribution under other licenses is expressly allowed. + Other licenses include GPL 2 or higher, BSD, and non-free licenses. + The original, unrestricted version can be obtained from + http://www.lagom.nl/linux/hkcaptcha/ + + --------------------------------------------------------------------------- + AHGBold.ttf (AlteHaasGroteskBold.ttf) font was created by Yann Le Coroller + and is distributed as freeware. + + Alte Haas Grotesk is a typeface that look like an helvetica printed in an + old Muller-Brockmann Book. + + These fonts are freeware and can be distributed as long as they are + together with this text file. + + I would appreciate very much to see what you have done with it anyway. + + yann le coroller + www.yannlecoroller.com + yann@lecoroller.com + + --------------------------------------------------------------------------- + Portions of securimage_play.swf use the PopForge flash library for + playing audio + + /** + * Copyright(C) 2007 Andre Michelle and Joa Ebert + * + * PopForge is an ActionScript3 code sandbox developed by Andre Michelle + * and Joa Ebert + * http://sandbox.popforge.de + * + * PopforgeAS3Audio is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * PopforgeAS3Audio is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see + */ + + -------------------------------------------------------------------------- + Some graphics used are from the Humility Icon Pack by WorLord + + License: GNU/GPL (http://findicons.com/pack/1723/humility) + http://findicons.com/icon/192558/gnome_volume_control + http://findicons.com/icon/192562/gtk_refresh + + -------------------------------------------------------------------------- + Background noise sound files are from SoundJay.com + http://www.soundjay.com/tos.html + + All sound effects on this website are created by us and protected under + the copyright laws, international treaty provisions and other applicable + laws. By downloading sounds, music or any material from this site implies + that you have read and accepted these terms and conditions: + + Sound Effects + You are allowed to use the sounds free of charge and royalty free in your + projects (such as films, videos, games, presentations, animations, stage + plays, radio plays, audio books, apps) be it for commercial or + non-commercial purposes. + + But you are NOT allowed to + - post the sounds (as sound effects or ringtones) on any website for + others to download, copy or use + - use them as a raw material to create sound effects or ringtones that + you will sell, distribute or offer for downloading + - sell, re-sell, license or re-license the sounds (as individual sound + effects or as a sound effects library) to anyone else + - claim the sounds as yours + - link directly to individual sound files + - distribute the sounds in apps or computer programs that are clearly + sound related in nature (such as sound machine, sound effect + generator, ringtone maker, funny sounds app, sound therapy app, etc.) 
+ or in apps or computer programs that use the sounds as the program's + sound resource library for other people's use (such as animation + creator, digital book creator, song maker software, etc.). If you are + developing such computer programs, contact us for licensing options. + + If you use the sound effects, please consider giving us a credit and + linking back to us but it's not required. + diff --git a/html/securimage/WavFile.php b/html/securimage/WavFile.php new file mode 100755 index 0000000..8702d22 --- /dev/null +++ b/html/securimage/WavFile.php @@ -0,0 +1,1913 @@ + +* File: WavFile.php
+* +* Copyright (c) 2014, Drew Phillips +* All rights reserved. +* +* Redistribution and use in source and binary forms, with or without modification, +* are permitted provided that the following conditions are met: +* +* - Redistributions of source code must retain the above copyright notice, +* this list of conditions and the following disclaimer. +* - Redistributions in binary form must reproduce the above copyright notice, +* this list of conditions and the following disclaimer in the documentation +* and/or other materials provided with the distribution. +* +* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE +* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +* POSSIBILITY OF SUCH DAMAGE. +* +* Any modifications to the library should be indicated clearly in the source code +* to inform users that the changes are not a part of the original software.

+* +* @copyright 2014 Drew Phillips +* @author Drew Phillips +* @author Paul Voegler +* @version 1.1.1 (Sep 2015) +* @package PHPWavUtils +* @license BSD License +* +* Changelog: +* 1.1.1 (09/08/2015) +* - Fix degrade() method to call filter correctly (Rasmus Lerdorf) +* +* 1.1 (02/8/2014) +* - Add method setIgnoreChunkSizes() to allow reading of wav data with bogus chunk sizes set. +* This allows streamed wav data to be processed where the chunk sizes were not known when +* writing the header. Instead calculates the chunk sizes automatically. +* - Add simple volume filter to attenuate or amplify the audio signal. +* +* 1.0 (10/2/2012) +* - Fix insertSilence() creating invalid block size +* +* 1.0 RC1 (4/20/2012) +* - Initial release candidate +* - Supports 8, 16, 24, 32 bit PCM, 32-bit IEEE FLOAT, Extensible Format +* - Support for 18 channels of audio +* - Ability to read an offset from a file to reduce memory footprint with large files +* - Single-pass audio filter processing +* - Highly accurate and efficient mix and normalization filters (http://www.voegler.eu/pub/audio/) +* - Utility filters for degrading audio, and inserting silence +* +* 0.6 (4/12/2012) +* - Support 8, 16, 24, 32 bit and PCM float (Paul Voegler) +* - Add normalize filter, misc improvements and fixes (Paul Voegler) +* - Normalize parameters to filter() to use filter constants as array indices +* - Add option to mix filter to loop the target file if the source is longer +* +* 0.5 (4/3/2012) +* - Fix binary pack routine (Paul Voegler) +* - Add improved mixing function (Paul Voegler) +* +*/ + +class WavFile +{ + /*%******************************************************************************************%*/ + // Class constants + + /** @var int Filter flag for mixing two files */ + const FILTER_MIX = 0x01; + + /** @var int Filter flag for normalizing audio data */ + const FILTER_NORMALIZE = 0x02; + + /** @var int Filter flag for degrading audio data */ + const FILTER_DEGRADE = 0x04; + + /** 
@var int Filter flag for amplifying or attenuating audio data. */ + const FILTER_VOLUME = 0x08; + + /** @var int Maximum number of channels */ + const MAX_CHANNEL = 18; + + /** @var int Maximum sample rate */ + const MAX_SAMPLERATE = 192000; + + /** Channel Locations for ChannelMask */ + const SPEAKER_DEFAULT = 0x000000; + const SPEAKER_FRONT_LEFT = 0x000001; + const SPEAKER_FRONT_RIGHT = 0x000002; + const SPEAKER_FRONT_CENTER = 0x000004; + const SPEAKER_LOW_FREQUENCY = 0x000008; + const SPEAKER_BACK_LEFT = 0x000010; + const SPEAKER_BACK_RIGHT = 0x000020; + const SPEAKER_FRONT_LEFT_OF_CENTER = 0x000040; + const SPEAKER_FRONT_RIGHT_OF_CENTER = 0x000080; + const SPEAKER_BACK_CENTER = 0x000100; + const SPEAKER_SIDE_LEFT = 0x000200; + const SPEAKER_SIDE_RIGHT = 0x000400; + const SPEAKER_TOP_CENTER = 0x000800; + const SPEAKER_TOP_FRONT_LEFT = 0x001000; + const SPEAKER_TOP_FRONT_CENTER = 0x002000; + const SPEAKER_TOP_FRONT_RIGHT = 0x004000; + const SPEAKER_TOP_BACK_LEFT = 0x008000; + const SPEAKER_TOP_BACK_CENTER = 0x010000; + const SPEAKER_TOP_BACK_RIGHT = 0x020000; + const SPEAKER_ALL = 0x03FFFF; + + /** @var int PCM Audio Format */ + const WAVE_FORMAT_PCM = 0x0001; + + /** @var int IEEE FLOAT Audio Format */ + const WAVE_FORMAT_IEEE_FLOAT = 0x0003; + + /** @var int EXTENSIBLE Audio Format - actual audio format defined by SubFormat */ + const WAVE_FORMAT_EXTENSIBLE = 0xFFFE; + + /** @var string PCM Audio Format SubType - LE hex representation of GUID {00000001-0000-0010-8000-00AA00389B71} */ + const WAVE_SUBFORMAT_PCM = "0100000000001000800000aa00389b71"; + + /** @var string IEEE FLOAT Audio Format SubType - LE hex representation of GUID {00000003-0000-0010-8000-00AA00389B71} */ + const WAVE_SUBFORMAT_IEEE_FLOAT = "0300000000001000800000aa00389b71"; + + + /*%******************************************************************************************%*/ + // Properties + + /** @var array Log base modifier lookup table for a given threshold (in 0.05 steps) used by 
normalizeSample. + * Adjusts the slope (1st derivative) of the log function at the threshold to 1 for a smooth transition + * from linear to logarithmic amplitude output. */ + protected static $LOOKUP_LOGBASE = array( + 2.513, 2.667, 2.841, 3.038, 3.262, + 3.520, 3.819, 4.171, 4.589, 5.093, + 5.711, 6.487, 7.483, 8.806, 10.634, + 13.302, 17.510, 24.970, 41.155, 96.088 + ); + + /** @var int The actual physical file size */ + protected $_actualSize; + + /** @var int The size of the file in RIFF header */ + protected $_chunkSize; + + /** @var int The size of the "fmt " chunk */ + protected $_fmtChunkSize; + + /** @var int The size of the extended "fmt " data */ + protected $_fmtExtendedSize; + + /** @var int The size of the "fact" chunk */ + protected $_factChunkSize; + + /** @var int Size of the data chunk */ + protected $_dataSize; + + /** @var int Size of the data chunk in the opened wav file */ + protected $_dataSize_fp; + + /** @var bool Does _dataSize really reflect strlen($_samples)? 
Case when a wav file is read with readData = false */ + protected $_dataSize_valid; + + /** @var int Starting offset of data chunk */ + protected $_dataOffset; + + /** @var int The audio format - WavFile::WAVE_FORMAT_* */ + protected $_audioFormat; + + /** @var int|string|null The audio subformat - WavFile::WAVE_SUBFORMAT_* */ + protected $_audioSubFormat; + + /** @var int Number of channels in the audio file */ + protected $_numChannels; + + /** @var int The channel mask */ + protected $_channelMask; + + /** @var int Samples per second */ + protected $_sampleRate; + + /** @var int Number of bits per sample */ + protected $_bitsPerSample; + + /** @var int Number of valid bits per sample */ + protected $_validBitsPerSample; + + /** @var int NumChannels * BitsPerSample/8 */ + protected $_blockAlign; + + /** @var int Number of sample blocks */ + protected $_numBlocks; + + /** @var int Bytes per second */ + protected $_byteRate; + + /** @var bool Ignore chunk sizes when reading wav data (useful when reading data from a stream where chunk sizes contain dummy values) */ + protected $_ignoreChunkSizes; + + /** @var string Binary string of samples */ + protected $_samples; + + /** @var resource|null The file pointer used for reading wavs from file or memory */ + protected $_fp; + + + /*%******************************************************************************************%*/ + // Special methods + + /** + * WavFile Constructor. + * + * + * $wav1 = new WavFile(2, 44100, 16); // new wav with 2 channels, at 44100 samples/sec and 16 bits per sample + * $wav2 = new WavFile('./audio/sound.wav'); // open and read wav file + * + * + * @param string|int $numChannelsOrFileName (Optional) If string, the filename of the wav file to open. The number of channels otherwise. Defaults to 1. + * @param int|bool $sampleRateOrReadData (Optional) If opening a file and boolean, decides whether to read the data chunk or not. Defaults to true. The sample rate in samples per second otherwise. 
8000 = standard telephone, 16000 = wideband telephone, 32000 = FM radio and 44100 = CD quality. Defaults to 8000. + * @param int $bitsPerSample (Optional) The number of bits per sample. Has to be 8, 16 or 24 for PCM audio or 32 for IEEE FLOAT audio. 8 = telephone, 16 = CD and 24 or 32 = studio quality. Defaults to 8. + * @throws WavFormatException + * @throws WavFileException + */ + public function __construct($numChannelsOrFileName = null, $sampleRateOrReadData = null, $bitsPerSample = null) + { + $this->_actualSize = 44; + $this->_chunkSize = 36; + $this->_fmtChunkSize = 16; + $this->_fmtExtendedSize = 0; + $this->_factChunkSize = 0; + $this->_dataSize = 0; + $this->_dataSize_fp = 0; + $this->_dataSize_valid = true; + $this->_dataOffset = 44; + $this->_audioFormat = self::WAVE_FORMAT_PCM; + $this->_audioSubFormat = null; + $this->_numChannels = 1; + $this->_channelMask = self::SPEAKER_DEFAULT; + $this->_sampleRate = 8000; + $this->_bitsPerSample = 8; + $this->_validBitsPerSample = 8; + $this->_blockAlign = 1; + $this->_numBlocks = 0; + $this->_byteRate = 8000; + $this->_ignoreChunkSizes = false; + $this->_samples = ''; + $this->_fp = null; + + + if (is_string($numChannelsOrFileName)) { + $this->openWav($numChannelsOrFileName, is_bool($sampleRateOrReadData) ? $sampleRateOrReadData : true); + + } else { + $this->setNumChannels(is_null($numChannelsOrFileName) ? 1 : $numChannelsOrFileName) + ->setSampleRate(is_null($sampleRateOrReadData) ? 8000 : $sampleRateOrReadData) + ->setBitsPerSample(is_null($bitsPerSample) ? 8 : $bitsPerSample); + } + } + + public function __destruct() { + if (is_resource($this->_fp)) $this->closeWav(); + } + + public function __clone() { + $this->_fp = null; + } + + /** + * Output the wav file headers and data. + * + * @return string The encoded file. + */ + public function __toString() + { + return $this->makeHeader() . 
+ $this->getDataSubchunk(); + } + + + /*%******************************************************************************************%*/ + // Static methods + + /** + * Unpacks a single binary sample to numeric value. + * + * @param string $sampleBinary (Required) The sample to decode. + * @param int $bitDepth (Optional) The bits per sample to decode. If omitted, derives it from the length of $sampleBinary. + * @return int|float|null The numeric sample value. Float for 32-bit samples. Returns null for unsupported bit depths. + */ + public static function unpackSample($sampleBinary, $bitDepth = null) + { + if ($bitDepth === null) { + $bitDepth = strlen($sampleBinary) * 8; + } + + switch ($bitDepth) { + case 8: + // unsigned char + return ord($sampleBinary); + + case 16: + // signed short, little endian + $data = unpack('v', $sampleBinary); + $sample = $data[1]; + if ($sample >= 0x8000) { + $sample -= 0x10000; + } + return $sample; + + case 24: + // 3 byte packed signed integer, little endian + $data = unpack('C3', $sampleBinary); + $sample = $data[1] | ($data[2] << 8) | ($data[3] << 16); + if ($sample >= 0x800000) { + $sample -= 0x1000000; + } + return $sample; + + case 32: + // 32-bit float + $data = unpack('f', $sampleBinary); + return $data[1]; + + default: + return null; + } + } + + /** + * Packs a single numeric sample to binary. + * + * @param int|float $sample (Required) The sample to encode. Has to be within valid range for $bitDepth. Float values only for 32 bits. + * @param int $bitDepth (Required) The bits per sample to encode with. + * @return string|null The encoded binary sample. Returns null for unsupported bit depths. 
+ */ + public static function packSample($sample, $bitDepth) + { + switch ($bitDepth) { + case 8: + // unsigned char + return chr($sample); + + case 16: + // signed short, little endian + if ($sample < 0) { + $sample += 0x10000; + } + return pack('v', $sample); + + case 24: + // 3 byte packed signed integer, little endian + if ($sample < 0) { + $sample += 0x1000000; + } + return pack('C3', $sample & 0xff, ($sample >> 8) & 0xff, ($sample >> 16) & 0xff); + + case 32: + // 32-bit float + return pack('f', $sample); + + default: + return null; + } + } + + /** + * Unpacks a binary sample block to numeric values. + * + * @param string $sampleBlock (Required) The binary sample block (all channels). + * @param int $bitDepth (Required) The bits per sample to decode. + * @param int $numChannels (Optional) The number of channels to decode. If omitted, derives it from the length of $sampleBlock and $bitDepth. + * @return array The sample values as an array of integers of floats for 32 bits. First channel is array index 1. + */ + public static function unpackSampleBlock($sampleBlock, $bitDepth, $numChannels = null) { + $sampleBytes = $bitDepth / 8; + if ($numChannels === null) { + $numChannels = strlen($sampleBlock) / $sampleBytes; + } + + $samples = array(); + for ($i = 0; $i < $numChannels; $i++) { + $sampleBinary = substr($sampleBlock, $i * $sampleBytes, $sampleBytes); + $samples[$i + 1] = self::unpackSample($sampleBinary, $bitDepth); + } + + return $samples; + } + + /** + * Packs an array of numeric channel samples to a binary sample block. + * + * @param array $samples (Required) The array of channel sample values. Expects float values for 32 bits and integer otherwise. + * @param int $bitDepth (Required) The bits per sample to encode with. + * @return string The encoded binary sample block. 
+ */ + public static function packSampleBlock($samples, $bitDepth) { + $sampleBlock = ''; + foreach($samples as $sample) { + $sampleBlock .= self::packSample($sample, $bitDepth); + } + + return $sampleBlock; + } + + /** + * Normalizes a float audio sample. Maximum input range assumed for compression is [-2, 2]. + * See http://www.voegler.eu/pub/audio/ for more information. + * + * @param float $sampleFloat (Required) The float sample to normalize. + * @param float $threshold (Required) The threshold or gain factor for normalizing the amplitude.
    + *
  • >= 1 - Normalize by multiplying by the threshold (boost - positive gain).
    + * A value of 1 in effect means no normalization (and results in clipping).
  • + *
  • <= -1 - Normalize by dividing by the absolute value of threshold (attenuate - negative gain).
    + * A threshold of -2 (division by a factor of 2) is about a 6 dB reduction in volume.
  • + *
  • [0, 1) - (half-open interval - not including 1) - The threshold + * above which amplitudes are compressed logarithmically.
    + * e.g. 0.6 to leave amplitudes up to 60% "as is" and compress above.
  • + *
  • (-1, 0) - (open interval - not including -1 and 0) - The threshold + * above which amplitudes are compressed linearly.
    + * e.g. -0.6 to leave amplitudes up to 60% "as is" and compress above.
+ * @return float The normalized sample. + **/ + public static function normalizeSample($sampleFloat, $threshold) { + // apply positive gain + if ($threshold >= 1) { + return $sampleFloat * $threshold; + } + + // apply negative gain + if ($threshold <= -1) { + return $sampleFloat / -$threshold; + } + + $sign = $sampleFloat < 0 ? -1 : 1; + $sampleAbs = abs($sampleFloat); + + // logarithmic compression + if ($threshold >= 0 && $threshold < 1 && $sampleAbs > $threshold) { + $loga = self::$LOOKUP_LOGBASE[(int)($threshold * 20)]; // log base modifier + return $sign * ($threshold + (1 - $threshold) * log(1 + $loga * ($sampleAbs - $threshold) / (2 - $threshold)) / log(1 + $loga)); + } + + // linear compression + $thresholdAbs = abs($threshold); + if ($threshold > -1 && $threshold < 0 && $sampleAbs > $thresholdAbs) { + return $sign * ($thresholdAbs + (1 - $thresholdAbs) / (2 - $thresholdAbs) * ($sampleAbs - $thresholdAbs)); + } + + // else ? + return $sampleFloat; + } + + + /*%******************************************************************************************%*/ + // Getter and Setter methods for properties + + public function getActualSize() { + return $this->_actualSize; + } + + /** @param int $actualSize */ + protected function setActualSize($actualSize = null) { + if (is_null($actualSize)) { + $this->_actualSize = 8 + $this->_chunkSize; // + "RIFF" header (ID + size) + } else { + $this->_actualSize = $actualSize; + } + + return $this; + } + + public function getChunkSize() { + return $this->_chunkSize; + } + + /** @param int $chunkSize */ + protected function setChunkSize($chunkSize = null) { + if (is_null($chunkSize)) { + $this->_chunkSize = 4 + // "WAVE" chunk + 8 + $this->_fmtChunkSize + // "fmt " subchunk + ($this->_factChunkSize > 0 ? 
8 + $this->_factChunkSize : 0) + // "fact" subchunk + 8 + $this->_dataSize + // "data" subchunk + ($this->_dataSize & 1); // padding byte + } else { + $this->_chunkSize = $chunkSize; + } + + $this->setActualSize(); + + return $this; + } + + public function getFmtChunkSize() { + return $this->_fmtChunkSize; + } + + /** @param int $fmtChunkSize */ + protected function setFmtChunkSize($fmtChunkSize = null) { + if (is_null($fmtChunkSize)) { + $this->_fmtChunkSize = 16 + $this->_fmtExtendedSize; + } else { + $this->_fmtChunkSize = $fmtChunkSize; + } + + $this->setChunkSize() // implicit setActualSize() + ->setDataOffset(); + + return $this; + } + + public function getFmtExtendedSize() { + return $this->_fmtExtendedSize; + } + + /** @param int $fmtExtendedSize */ + protected function setFmtExtendedSize($fmtExtendedSize = null) { + if (is_null($fmtExtendedSize)) { + if ($this->_audioFormat == self::WAVE_FORMAT_EXTENSIBLE) { + $this->_fmtExtendedSize = 2 + 22; // extension size for WAVE_FORMAT_EXTENSIBLE + } elseif ($this->_audioFormat != self::WAVE_FORMAT_PCM) { + $this->_fmtExtendedSize = 2 + 0; // empty extension + } else { + $this->_fmtExtendedSize = 0; // no extension, only for WAVE_FORMAT_PCM + } + } else { + $this->_fmtExtendedSize = $fmtExtendedSize; + } + + $this->setFmtChunkSize(); // implicit setSize(), setActualSize(), setDataOffset() + + return $this; + } + + public function getFactChunkSize() { + return $this->_factChunkSize; + } + + /** @param int $factChunkSize */ + protected function setFactChunkSize($factChunkSize = null) { + if (is_null($factChunkSize)) { + if ($this->_audioFormat != self::WAVE_FORMAT_PCM) { + $this->_factChunkSize = 4; + } else { + $this->_factChunkSize = 0; + } + } else { + $this->_factChunkSize = $factChunkSize; + } + + $this->setChunkSize() // implicit setActualSize() + ->setDataOffset(); + + return $this; + } + + public function getDataSize() { + return $this->_dataSize; + } + + /** @param int $dataSize */ + protected function 
setDataSize($dataSize = null) { + if (is_null($dataSize)) { + $this->_dataSize = strlen($this->_samples); + } else { + $this->_dataSize = $dataSize; + } + + $this->setChunkSize() // implicit setActualSize() + ->setNumBlocks(); + $this->_dataSize_valid = true; + + return $this; + } + + public function getDataOffset() { + return $this->_dataOffset; + } + + /** @param int $dataOffset */ + protected function setDataOffset($dataOffset = null) { + if (is_null($dataOffset)) { + $this->_dataOffset = 8 + // "RIFF" header (ID + size) + 4 + // "WAVE" chunk + 8 + $this->_fmtChunkSize + // "fmt " subchunk + ($this->_factChunkSize > 0 ? 8 + $this->_factChunkSize : 0) + // "fact" subchunk + 8; // "data" subchunk + } else { + $this->_dataOffset = $dataOffset; + } + + return $this; + } + + public function getAudioFormat() { + return $this->_audioFormat; + } + + /** @param int $audioFormat */ + protected function setAudioFormat($audioFormat = null) { + if (is_null($audioFormat)) { + if (($this->_bitsPerSample <= 16 || $this->_bitsPerSample == 32) + && $this->_validBitsPerSample == $this->_bitsPerSample + && $this->_channelMask == self::SPEAKER_DEFAULT + && $this->_numChannels <= 2) { + if ($this->_bitsPerSample <= 16) { + $this->_audioFormat = self::WAVE_FORMAT_PCM; + } else { + $this->_audioFormat = self::WAVE_FORMAT_IEEE_FLOAT; + } + } else { + $this->_audioFormat = self::WAVE_FORMAT_EXTENSIBLE; + } + } else { + $this->_audioFormat = $audioFormat; + } + + $this->setAudioSubFormat() + ->setFactChunkSize() // implicit setSize(), setActualSize(), setDataOffset() + ->setFmtExtendedSize(); // implicit setFmtChunkSize(), setSize(), setActualSize(), setDataOffset() + + return $this; + } + + public function getAudioSubFormat() { + return $this->_audioSubFormat; + } + + /** @param int $audioSubFormat */ + protected function setAudioSubFormat($audioSubFormat = null) { + if (is_null($audioSubFormat)) { + if ($this->_bitsPerSample == 32) { + $this->_audioSubFormat = 
self::WAVE_SUBFORMAT_IEEE_FLOAT; // 32 bits are IEEE FLOAT in this class + } else { + $this->_audioSubFormat = self::WAVE_SUBFORMAT_PCM; // 8, 16 and 24 bits are PCM in this class + } + } else { + $this->_audioSubFormat = $audioSubFormat; + } + + return $this; + } + + public function getNumChannels() { + return $this->_numChannels; + } + + /** @param int $numChannels */ + public function setNumChannels($numChannels) { + if ($numChannels < 1 || $numChannels > self::MAX_CHANNEL) { + throw new WavFileException('Unsupported number of channels. Only up to ' . self::MAX_CHANNEL . ' channels are supported.'); + } elseif ($this->_samples !== '') { + trigger_error('Wav already has sample data. Changing the number of channels does not convert and may corrupt the data.', E_USER_NOTICE); + } + + $this->_numChannels = (int)$numChannels; + + $this->setAudioFormat() // implicit setAudioSubFormat(), setFactChunkSize(), setFmtExtendedSize(), setFmtChunkSize(), setSize(), setActualSize(), setDataOffset() + ->setByteRate() + ->setBlockAlign(); // implicit setNumBlocks() + + return $this; + } + + public function getChannelMask() { + return $this->_channelMask; + } + + public function setChannelMask($channelMask = self::SPEAKER_DEFAULT) { + if ($channelMask != 0) { + // count number of set bits - Hamming weight + $c = (int)$channelMask; + $n = 0; + while ($c > 0) { + $n += $c & 1; + $c >>= 1; + } + if ($n != $this->_numChannels || (((int)$channelMask | self::SPEAKER_ALL) != self::SPEAKER_ALL)) { + throw new WavFileException('Invalid channel mask. 
The number of channels does not match the number of locations in the mask.'); + } + } + + $this->_channelMask = (int)$channelMask; + + $this->setAudioFormat(); // implicit setAudioSubFormat(), setFactChunkSize(), setFmtExtendedSize(), setFmtChunkSize(), setSize(), setActualSize(), setDataOffset() + + return $this; + } + + public function getSampleRate() { + return $this->_sampleRate; + } + + public function setSampleRate($sampleRate) { + if ($sampleRate < 1 || $sampleRate > self::MAX_SAMPLERATE) { + throw new WavFileException('Invalid sample rate.'); + } elseif ($this->_samples !== '') { + trigger_error('Wav already has sample data. Changing the sample rate does not convert the data and may yield undesired results.', E_USER_NOTICE); + } + + $this->_sampleRate = (int)$sampleRate; + + $this->setByteRate(); + + return $this; + } + + public function getBitsPerSample() { + return $this->_bitsPerSample; + } + + public function setBitsPerSample($bitsPerSample) { + if (!in_array($bitsPerSample, array(8, 16, 24, 32))) { + throw new WavFileException('Unsupported bits per sample. Only 8, 16, 24 and 32 bits are supported.'); + } elseif ($this->_samples !== '') { + trigger_error('Wav already has sample data. 
Changing the bits per sample does not convert and may corrupt the data.', E_USER_NOTICE); + } + + $this->_bitsPerSample = (int)$bitsPerSample; + + $this->setValidBitsPerSample() // implicit setAudioFormat(), setAudioSubFormat(), setFmtChunkSize(), setFactChunkSize(), setSize(), setActualSize(), setDataOffset() + ->setByteRate() + ->setBlockAlign(); // implicit setNumBlocks() + + return $this; + } + + public function getValidBitsPerSample() { + return $this->_validBitsPerSample; + } + + protected function setValidBitsPerSample($validBitsPerSample = null) { + if (is_null($validBitsPerSample)) { + $this->_validBitsPerSample = $this->_bitsPerSample; + } else { + if ($validBitsPerSample < 1 || $validBitsPerSample > $this->_bitsPerSample) { + throw new WavFileException('ValidBitsPerSample cannot be greater than BitsPerSample.'); + } + $this->_validBitsPerSample = (int)$validBitsPerSample; + } + + $this->setAudioFormat(); // implicit setAudioSubFormat(), setFactChunkSize(), setFmtExtendedSize(), setFmtChunkSize(), setSize(), setActualSize(), setDataOffset() + + return $this; + } + + public function getBlockAlign() { + return $this->_blockAlign; + } + + /** @param int $blockAlign */ + protected function setBlockAlign($blockAlign = null) { + if (is_null($blockAlign)) { + $this->_blockAlign = $this->_numChannels * $this->_bitsPerSample / 8; + } else { + $this->_blockAlign = $blockAlign; + } + + $this->setNumBlocks(); + + return $this; + } + + public function getNumBlocks() + { + return $this->_numBlocks; + } + + /** @param int $numBlocks */ + protected function setNumBlocks($numBlocks = null) { + if (is_null($numBlocks)) { + $this->_numBlocks = (int)($this->_dataSize / $this->_blockAlign); // do not count incomplete sample blocks + } else { + $this->_numBlocks = $numBlocks; + } + + return $this; + } + + public function getByteRate() { + return $this->_byteRate; + } + + /** @param int $byteRate */ + protected function setByteRate($byteRate = null) { + if (is_null($byteRate)) 
{ + $this->_byteRate = $this->_sampleRate * $this->_numChannels * $this->_bitsPerSample / 8; + } else { + $this->_byteRate = $byteRate; + } + + return $this; + } + + public function getIgnoreChunkSizes() + { + return $this->_ignoreChunkSizes; + } + + public function setIgnoreChunkSizes($ignoreChunkSizes) + { + $this->_ignoreChunkSizes = (bool)$ignoreChunkSizes; + return $this; + } + + public function getSamples() { + return $this->_samples; + } + + public function setSamples(&$samples = '') { + if (strlen($samples) % $this->_blockAlign != 0) { + throw new WavFileException('Incorrect samples size. Has to be a multiple of BlockAlign.'); + } + + $this->_samples = $samples; + + $this->setDataSize(); // implicit setSize(), setActualSize(), setNumBlocks() + + return $this; + } + + + /*%******************************************************************************************%*/ + // Getters + + public function getMinAmplitude() + { + if ($this->_bitsPerSample == 8) { + return 0; + } elseif ($this->_bitsPerSample == 32) { + return -1.0; + } else { + return -(1 << ($this->_bitsPerSample - 1)); + } + } + + public function getZeroAmplitude() + { + if ($this->_bitsPerSample == 8) { + return 0x80; + } elseif ($this->_bitsPerSample == 32) { + return 0.0; + } else { + return 0; + } + } + + public function getMaxAmplitude() + { + if($this->_bitsPerSample == 8) { + return 0xFF; + } elseif($this->_bitsPerSample == 32) { + return 1.0; + } else { + return (1 << ($this->_bitsPerSample - 1)) - 1; + } + } + + + /*%******************************************************************************************%*/ + // Wave file methods + + /** + * Construct a wav header from this object. Includes "fact" chunk if necessary. + * http://www-mmsp.ece.mcgill.ca/documents/audioformats/wave/wave.html + * + * @return string The RIFF header data. 
+ */ + public function makeHeader() + { + // reset and recalculate + $this->setAudioFormat(); // implicit setAudioSubFormat(), setFactChunkSize(), setFmtExtendedSize(), setFmtChunkSize(), setSize(), setActualSize(), setDataOffset() + $this->setNumBlocks(); + + // RIFF header + $header = pack('N', 0x52494646); // ChunkID - "RIFF" + $header .= pack('V', $this->getChunkSize()); // ChunkSize + $header .= pack('N', 0x57415645); // Format - "WAVE" + + // "fmt " subchunk + $header .= pack('N', 0x666d7420); // SubchunkID - "fmt " + $header .= pack('V', $this->getFmtChunkSize()); // SubchunkSize + $header .= pack('v', $this->getAudioFormat()); // AudioFormat + $header .= pack('v', $this->getNumChannels()); // NumChannels + $header .= pack('V', $this->getSampleRate()); // SampleRate + $header .= pack('V', $this->getByteRate()); // ByteRate + $header .= pack('v', $this->getBlockAlign()); // BlockAlign + $header .= pack('v', $this->getBitsPerSample()); // BitsPerSample + if($this->getFmtExtendedSize() == 24) { + $header .= pack('v', 22); // extension size = 24 bytes, cbSize: 24 - 2 = 22 bytes + $header .= pack('v', $this->getValidBitsPerSample()); // ValidBitsPerSample + $header .= pack('V', $this->getChannelMask()); // ChannelMask + $header .= pack('H32', $this->getAudioSubFormat()); // SubFormat + } elseif ($this->getFmtExtendedSize() == 2) { + $header .= pack('v', 0); // extension size = 2 bytes, cbSize: 2 - 2 = 0 bytes + } + + // "fact" subchunk + if ($this->getFactChunkSize() == 4) { + $header .= pack('N', 0x66616374); // SubchunkID - "fact" + $header .= pack('V', 4); // SubchunkSize + $header .= pack('V', $this->getNumBlocks()); // SampleLength (per channel) + } + + return $header; + } + + /** + * Construct wav DATA chunk. + * + * @return string The DATA header and chunk. 
+ */ + public function getDataSubchunk() + { + // check preconditions + if (!$this->_dataSize_valid) { + $this->setDataSize(); // implicit setSize(), setActualSize(), setNumBlocks() + } + + + // create subchunk + return pack('N', 0x64617461) . // SubchunkID - "data" + pack('V', $this->getDataSize()) . // SubchunkSize + $this->_samples . // Subchunk data + ($this->getDataSize() & 1 ? chr(0) : ''); // padding byte + } + + /** + * Save the wav data to a file. + * + * @param string $filename (Required) The file path to save the wav to. + * @throws WavFileException + */ + public function save($filename) + { + $fp = @fopen($filename, 'w+b'); + if (!is_resource($fp)) { + throw new WavFileException('Failed to open "' . $filename . '" for writing.'); + } + + fwrite($fp, $this->makeHeader()); + fwrite($fp, $this->getDataSubchunk()); + fclose($fp); + + return $this; + } + + /** + * Reads a wav header and data from a file. + * + * @param string $filename (Required) The path to the wav file to read. + * @param bool $readData (Optional) If true, also read the data chunk. + * @throws WavFormatException + * @throws WavFileException + */ + public function openWav($filename, $readData = true) + { + // check preconditions + if (!file_exists($filename)) { + throw new WavFileException('Failed to open "' . $filename . '". File not found.'); + } elseif (!is_readable($filename)) { + throw new WavFileException('Failed to open "' . $filename . '". File is not readable.'); + } elseif (is_resource($this->_fp)) { + $this->closeWav(); + } + + + // open the file + $this->_fp = @fopen($filename, 'rb'); + if (!is_resource($this->_fp)) { + throw new WavFileException('Failed to open "' . $filename . '".'); + } + + // read the file + return $this->readWav($readData); + } + + /** + * Close a with openWav() previously opened wav file or free the buffer of setWavData(). + * Not necessary if the data has been read (readData = true) already. 
+ */ + public function closeWav() { + if (is_resource($this->_fp)) fclose($this->_fp); + + return $this; + } + + /** + * Set the wav file data and properties from a wav file in a string. + * + * @param string $data (Required) The wav file data. Passed by reference. + * @param bool $free (Optional) True to free the passed $data after copying. + * @throws WavFormatException + * @throws WavFileException + */ + public function setWavData(&$data, $free = true) + { + // check preconditions + if (is_resource($this->_fp)) $this->closeWav(); + + + // open temporary stream in memory + $this->_fp = @fopen('php://memory', 'w+b'); + if (!is_resource($this->_fp)) { + throw new WavFileException('Failed to open memory stream to write wav data. Use openWav() instead.'); + } + + // prepare stream + fwrite($this->_fp, $data); + rewind($this->_fp); + + // free the passed data + if ($free) $data = null; + + // read the stream like a file + return $this->readWav(true); + } + + /** + * Read wav file from a stream. + * + * @param bool $readData (Optional) If true, also read the data chunk. + * @throws WavFormatException + * @throws WavFileException + */ + protected function readWav($readData = true) + { + if (!is_resource($this->_fp)) { + throw new WavFileException('No wav file open. Use openWav() first.'); + } + + try { + $this->readWavHeader(); + } catch (WavFileException $ex) { + $this->closeWav(); + throw $ex; + } + + if ($readData) return $this->readWavData(); + + return $this; + } + + /** + * Parse a wav header. + * http://www-mmsp.ece.mcgill.ca/documents/audioformats/wave/wave.html + * + * @throws WavFormatException + * @throws WavFileException + */ + protected function readWavHeader() + { + if (!is_resource($this->_fp)) { + throw new WavFileException('No wav file open. 
Use openWav() first.'); + } + + // get actual file size + $stat = fstat($this->_fp); + $actualSize = $stat['size']; + + $this->_actualSize = $actualSize; + + + // read the common header + $header = fread($this->_fp, 36); // minimum size of the wav header + if (strlen($header) < 36) { + throw new WavFormatException('Not wav format. Header too short.', 1); + } + + + // check "RIFF" header + $RIFF = unpack('NChunkID/VChunkSize/NFormat', $header); + + if ($RIFF['ChunkID'] != 0x52494646) { // "RIFF" + throw new WavFormatException('Not wav format. "RIFF" signature missing.', 2); + } + + if ($this->getIgnoreChunkSizes()) { + $RIFF['ChunkSize'] = $actualSize - 8; + } else if ($actualSize - 8 < $RIFF['ChunkSize']) { + trigger_error('"RIFF" chunk size does not match actual file size. Found ' . $RIFF['ChunkSize'] . ', expected ' . ($actualSize - 8) . '.', E_USER_NOTICE); + $RIFF['ChunkSize'] = $actualSize - 8; + } + + if ($RIFF['Format'] != 0x57415645) { // "WAVE" + throw new WavFormatException('Not wav format. "RIFF" chunk format is not "WAVE".', 4); + } + + $this->_chunkSize = $RIFF['ChunkSize']; + + + // check common "fmt " subchunk + $fmt = unpack('NSubchunkID/VSubchunkSize/vAudioFormat/vNumChannels/' + .'VSampleRate/VByteRate/vBlockAlign/vBitsPerSample', + substr($header, 12)); + + if ($fmt['SubchunkID'] != 0x666d7420) { // "fmt " + throw new WavFormatException('Bad wav header. Expected "fmt " subchunk.', 11); + } + + if ($fmt['SubchunkSize'] < 16) { + throw new WavFormatException('Bad "fmt " subchunk size.', 12); + } + + if ( $fmt['AudioFormat'] != self::WAVE_FORMAT_PCM + && $fmt['AudioFormat'] != self::WAVE_FORMAT_IEEE_FLOAT + && $fmt['AudioFormat'] != self::WAVE_FORMAT_EXTENSIBLE) + { + throw new WavFormatException('Unsupported audio format. 
Only PCM or IEEE FLOAT (EXTENSIBLE) audio is supported.', 13); + } + + if ($fmt['NumChannels'] < 1 || $fmt['NumChannels'] > self::MAX_CHANNEL) { + throw new WavFormatException('Invalid number of channels in "fmt " subchunk.', 14); + } + + if ($fmt['SampleRate'] < 1 || $fmt['SampleRate'] > self::MAX_SAMPLERATE) { + throw new WavFormatException('Invalid sample rate in "fmt " subchunk.', 15); + } + + if ( ($fmt['AudioFormat'] == self::WAVE_FORMAT_PCM && !in_array($fmt['BitsPerSample'], array(8, 16, 24))) + || ($fmt['AudioFormat'] == self::WAVE_FORMAT_IEEE_FLOAT && $fmt['BitsPerSample'] != 32) + || ($fmt['AudioFormat'] == self::WAVE_FORMAT_EXTENSIBLE && !in_array($fmt['BitsPerSample'], array(8, 16, 24, 32)))) + { + throw new WavFormatException('Only 8, 16 and 24-bit PCM and 32-bit IEEE FLOAT (EXTENSIBLE) audio is supported.', 16); + } + + $blockAlign = $fmt['NumChannels'] * $fmt['BitsPerSample'] / 8; + if ($blockAlign != $fmt['BlockAlign']) { + trigger_error('Invalid block align in "fmt " subchunk. Found ' . $fmt['BlockAlign'] . ', expected ' . $blockAlign . '.', E_USER_NOTICE); + $fmt['BlockAlign'] = $blockAlign; + } + + $byteRate = $fmt['SampleRate'] * $blockAlign; + if ($byteRate != $fmt['ByteRate']) { + trigger_error('Invalid average byte rate in "fmt " subchunk. Found ' . $fmt['ByteRate'] . ', expected ' . $byteRate . 
'.', E_USER_NOTICE); + $fmt['ByteRate'] = $byteRate; + } + + $this->_fmtChunkSize = $fmt['SubchunkSize']; + $this->_audioFormat = $fmt['AudioFormat']; + $this->_numChannels = $fmt['NumChannels']; + $this->_sampleRate = $fmt['SampleRate']; + $this->_byteRate = $fmt['ByteRate']; + $this->_blockAlign = $fmt['BlockAlign']; + $this->_bitsPerSample = $fmt['BitsPerSample']; + + + // read extended "fmt " subchunk data + $extendedFmt = ''; + if ($fmt['SubchunkSize'] > 16) { + // possibly handle malformed subchunk without a padding byte + $extendedFmt = fread($this->_fp, $fmt['SubchunkSize'] - 16 + ($fmt['SubchunkSize'] & 1)); // also read padding byte + if (strlen($extendedFmt) < $fmt['SubchunkSize'] - 16) { + throw new WavFormatException('Not wav format. Header too short.', 1); + } + } + + + // check extended "fmt " for EXTENSIBLE Audio Format + if ($fmt['AudioFormat'] == self::WAVE_FORMAT_EXTENSIBLE) { + if (strlen($extendedFmt) < 24) { + throw new WavFormatException('Invalid EXTENSIBLE "fmt " subchunk size. Found ' . $fmt['SubchunkSize'] . ', expected at least 40.', 19); + } + + $extensibleFmt = unpack('vSize/vValidBitsPerSample/VChannelMask/H32SubFormat', substr($extendedFmt, 0, 24)); + + if ( $extensibleFmt['SubFormat'] != self::WAVE_SUBFORMAT_PCM + && $extensibleFmt['SubFormat'] != self::WAVE_SUBFORMAT_IEEE_FLOAT) + { + throw new WavFormatException('Unsupported audio format. 
Only PCM or IEEE FLOAT (EXTENSIBLE) audio is supported.', 13); + } + + if ( ($extensibleFmt['SubFormat'] == self::WAVE_SUBFORMAT_PCM && !in_array($fmt['BitsPerSample'], array(8, 16, 24))) + || ($extensibleFmt['SubFormat'] == self::WAVE_SUBFORMAT_IEEE_FLOAT && $fmt['BitsPerSample'] != 32)) + { + throw new WavFormatException('Only 8, 16 and 24-bit PCM and 32-bit IEEE FLOAT (EXTENSIBLE) audio is supported.', 16); + } + + if ($extensibleFmt['Size'] != 22) { + trigger_error('Invaid extension size in EXTENSIBLE "fmt " subchunk.', E_USER_NOTICE); + $extensibleFmt['Size'] = 22; + } + + if ($extensibleFmt['ValidBitsPerSample'] != $fmt['BitsPerSample']) { + trigger_error('Invaid or unsupported valid bits per sample in EXTENSIBLE "fmt " subchunk.', E_USER_NOTICE); + $extensibleFmt['ValidBitsPerSample'] = $fmt['BitsPerSample']; + } + + if ($extensibleFmt['ChannelMask'] != 0) { + // count number of set bits - Hamming weight + $c = (int)$extensibleFmt['ChannelMask']; + $n = 0; + while ($c > 0) { + $n += $c & 1; + $c >>= 1; + } + if ($n != $fmt['NumChannels'] || (((int)$extensibleFmt['ChannelMask'] | self::SPEAKER_ALL) != self::SPEAKER_ALL)) { + trigger_error('Invalid channel mask in EXTENSIBLE "fmt " subchunk. 
The number of channels does not match the number of locations in the mask.', E_USER_NOTICE); + $extensibleFmt['ChannelMask'] = 0; + } + } + + $this->_fmtExtendedSize = strlen($extendedFmt); + $this->_validBitsPerSample = $extensibleFmt['ValidBitsPerSample']; + $this->_channelMask = $extensibleFmt['ChannelMask']; + $this->_audioSubFormat = $extensibleFmt['SubFormat']; + + } else { + $this->_fmtExtendedSize = strlen($extendedFmt); + $this->_validBitsPerSample = $fmt['BitsPerSample']; + $this->_channelMask = 0; + $this->_audioSubFormat = null; + } + + + // read additional subchunks until "data" subchunk is found + $factSubchunk = array(); + $dataSubchunk = array(); + + while (!feof($this->_fp)) { + $subchunkHeader = fread($this->_fp, 8); + if (strlen($subchunkHeader) < 8) { + throw new WavFormatException('Missing "data" subchunk.', 101); + } + + $subchunk = unpack('NSubchunkID/VSubchunkSize', $subchunkHeader); + + if ($subchunk['SubchunkID'] == 0x66616374) { // "fact" + // possibly handle malformed subchunk without a padding byte + $subchunkData = fread($this->_fp, $subchunk['SubchunkSize'] + ($subchunk['SubchunkSize'] & 1)); // also read padding byte + if (strlen($subchunkData) < 4) { + throw new WavFormatException('Invalid "fact" subchunk.', 102); + } + + $factParams = unpack('VSampleLength', substr($subchunkData, 0, 4)); + $factSubchunk = array_merge($subchunk, $factParams); + + } elseif ($subchunk['SubchunkID'] == 0x64617461) { // "data" + $dataSubchunk = $subchunk; + + break; + + } elseif ($subchunk['SubchunkID'] == 0x7761766C) { // "wavl" + throw new WavFormatException('Wave List Chunk ("wavl" subchunk) is not supported.', 106); + } else { + // skip all other (unknown) subchunks + // possibly handle malformed subchunk without a padding byte + if ( $subchunk['SubchunkSize'] < 0 + || fseek($this->_fp, $subchunk['SubchunkSize'] + ($subchunk['SubchunkSize'] & 1), SEEK_CUR) !== 0) { // also skip padding byte + throw new WavFormatException('Invalid subchunk (0x' . 
dechex($subchunk['SubchunkID']) . ') encountered.', 103); + } + } + } + + if (empty($dataSubchunk)) { + throw new WavFormatException('Missing "data" subchunk.', 101); + } + + // check "data" subchunk + $dataOffset = ftell($this->_fp); + if ($this->getIgnoreChunkSizes()) { + $dataSubchunk['SubchunkSize'] = $actualSize - $dataOffset; + } elseif ($dataSubchunk['SubchunkSize'] < 0 || $actualSize - $dataOffset < $dataSubchunk['SubchunkSize']) { + trigger_error("Invalid \"data\" subchunk size (found {$dataSubchunk['SubchunkSize']}.", E_USER_NOTICE); + $dataSubchunk['SubchunkSize'] = $actualSize - $dataOffset; + } + + $this->_dataOffset = $dataOffset; + $this->_dataSize = $dataSubchunk['SubchunkSize']; + $this->_dataSize_fp = $dataSubchunk['SubchunkSize']; + $this->_dataSize_valid = false; + $this->_samples = ''; + + + // check "fact" subchunk + $numBlocks = (int)($dataSubchunk['SubchunkSize'] / $fmt['BlockAlign']); + + if (empty($factSubchunk)) { // construct fake "fact" subchunk + $factSubchunk = array('SubchunkSize' => 0, 'SampleLength' => $numBlocks); + } + + if ($factSubchunk['SampleLength'] != $numBlocks) { + trigger_error('Invalid sample length in "fact" subchunk.', E_USER_NOTICE); + $factSubchunk['SampleLength'] = $numBlocks; + } + + $this->_factChunkSize = $factSubchunk['SubchunkSize']; + $this->_numBlocks = $factSubchunk['SampleLength']; + + + return $this; + + } + + /** + * Read the wav data from the file into the buffer. + * + * @param int $dataOffset (Optional) The byte offset to skip before starting to read. Must be a multiple of BlockAlign. + * @param int $dataSize (Optional) The size of the data to read in bytes. Must be a multiple of BlockAlign. Defaults to all data. + * @throws WavFileException + */ + public function readWavData($dataOffset = 0, $dataSize = null) + { + // check preconditions + if (!is_resource($this->_fp)) { + throw new WavFileException('No wav file open. 
Use openWav() first.');
+        }
+
+        // offset and size must both address whole sample blocks
+        if ($dataOffset < 0 || $dataOffset % $this->getBlockAlign() > 0) {
+            throw new WavFileException('Invalid data offset. Has to be a multiple of BlockAlign.');
+        }
+
+        if (is_null($dataSize)) {
+            $dataSize = $this->_dataSize_fp - ($this->_dataSize_fp % $this->getBlockAlign()); // only read complete blocks
+        } elseif ($dataSize < 0 || $dataSize % $this->getBlockAlign() > 0) {
+            throw new WavFileException('Invalid data size to read. Has to be a multiple of BlockAlign.');
+        }
+
+
+        // skip offset (relative to the current position of the file pointer)
+        if ($dataOffset > 0 && fseek($this->_fp, $dataOffset, SEEK_CUR) !== 0) {
+            throw new WavFileException('Seeking to data offset failed.');
+        }
+
+        // read data, appending to whatever is already buffered.
+        // fread() requires a length greater than 0 (it raises a ValueError on PHP 8 and a
+        // warning on older versions), so an empty read is skipped instead of attempted.
+        if ($dataSize > 0) {
+            $this->_samples .= fread($this->_fp, $dataSize);
+        }
+        $this->setDataSize();  // implicit setSize(), setActualSize(), setNumBlocks()
+
+        // close file or memory stream
+        return $this->closeWav();
+    }
+
+
+    /*%******************************************************************************************%*/
+    // Sample manipulation methods
+
+    /**
+     * Return a single sample block from the sample buffer.
+     *
+     * @param int $blockNum  (Required) The sample block number. Zero based.
+     * @return string|null  The binary sample block (all channels). Returns null if the sample block number was out of range.
+     */
+    public function getSampleBlock($blockNum)
+    {
+        // refresh the cached data size if it has been flagged as stale
+        if (!$this->_dataSize_valid) {
+            $this->setDataSize();  // implicit setSize(), setActualSize(), setNumBlocks()
+        }
+
+        // the whole block has to lie inside the buffered data
+        $offset = $blockNum * $this->_blockAlign;
+        if ($offset + $this->_blockAlign > $this->_dataSize || $offset < 0) {
+            return null;
+        }
+
+
+        // read data
+        return substr($this->_samples, $offset, $this->_blockAlign);
+    }
+
+    /**
+     * Set a single sample block.
+     * Passing the current number of blocks as $blockNum appends the block instead of replacing one.
+     *
+     * @param string $sampleBlock  (Required) The binary sample block (all channels). Has to be exactly BlockAlign bytes long.
+     * @param int $blockNum  (Required) The sample block number. Zero based.
+     * @return $this
+     * @throws WavFileException  If the block has the wrong size or the block number is out of range.
+     */
+    public function setSampleBlock($sampleBlock, $blockNum)
+    {
+        $blockSize = $this->_blockAlign;
+
+        // exact length check without strlen(): the last expected byte must exist and nothing may follow it
+        $hasLastByte  = isset($sampleBlock[$blockSize - 1]);
+        $hasExtraByte = isset($sampleBlock[$blockSize]);
+        if (!$hasLastByte || $hasExtraByte) {
+            throw new WavFileException('Incorrect sample block size. Got ' . strlen($sampleBlock) . ', expected ' . $blockSize . '.');
+        }
+
+        // refresh the cached data size if it has been flagged as stale
+        if (!$this->_dataSize_valid) {
+            $this->setDataSize();  // implicit setSize(), setActualSize(), setNumBlocks()
+        }
+
+        $blockCount = (int)($this->_dataSize / $blockSize);
+        $position   = $blockNum * $blockSize;
+
+        // $blockNum == $blockCount is allowed: it addresses the slot right after the last block
+        if ($blockNum < 0 || $blockNum > $blockCount) {
+            throw new WavFileException('Sample block number is out of range.');
+        }
+
+        if ($blockNum != $blockCount) {
+            // overwrite the existing block byte by byte
+            for ($byte = 0; $byte < $blockSize; ++$byte) {
+                $this->_samples[$position + $byte] = $sampleBlock[$byte];
+            }
+
+            return $this;
+        }
+
+        // append the block and grow all size counters by one block
+        $this->_samples    .= $sampleBlock;
+        $this->_dataSize   += $blockSize;
+        $this->_chunkSize  += $blockSize;
+        $this->_actualSize += $blockSize;
+        $this->_numBlocks++;
+
+        return $this;
+    }
+
+    /**
+     * Get a float sample value for a specific sample block and channel number.
+     *
+     * @param int $blockNum  (Required) The sample block number to fetch. Zero based.
+     * @param int $channelNum  (Required) The channel number within the sample block to fetch. First channel is 1.
+     * @return float|null  The float sample value. Returns null if the sample block number was out of range.
+     * @throws WavFileException  If the channel number is out of range.
+     */
+    public function getSampleValue($blockNum, $channelNum)
+    {
+        if ($channelNum < 1 || $channelNum > $this->_numChannels) {
+            throw new WavFileException('Channel number is out of range.');
+        }
+
+        // refresh the cached data size if it has been flagged as stale
+        if (!$this->_dataSize_valid) {
+            $this->setDataSize();  // implicit setSize(), setActualSize(), setNumBlocks()
+        }
+
+        // locate the sample: start of the block plus the channel's position inside it
+        $bits  = $this->_bitsPerSample;
+        $width = $bits / 8;
+        $start = $blockNum * $this->_blockAlign + ($channelNum - 1) * $width;
+        if ($start < 0 || $start + $width > $this->_dataSize) {
+            return null;
+        }
+
+        $bytes = substr($this->_samples, $start, $width);
+
+        // decode the raw bytes and scale integers to the float range
+        if ($bits == 8) {
+            // unsigned char, 0x80 is the zero line
+            return (float)((ord($bytes) - 0x80) / 0x80);
+        }
+
+        if ($bits == 16) {
+            // signed short, little endian (read unsigned, then apply the sign)
+            $fields = unpack('v', $bytes);
+            $value  = $fields[1] >= 0x8000 ? $fields[1] - 0x10000 : $fields[1];
+            return (float)($value / 0x8000);
+        }
+
+        if ($bits == 24) {
+            // 3 byte packed signed integer, little endian
+            $fields = unpack('C3', $bytes);
+            $value  = $fields[1] | ($fields[2] << 8) | ($fields[3] << 16);
+            if ($value >= 0x800000) {
+                $value -= 0x1000000;
+            }
+            return (float)($value / 0x800000);
+        }
+
+        if ($bits == 32) {
+            // 32-bit float
+            $fields = unpack('f', $bytes);
+            return (float)$fields[1];
+        }
+
+        // no decoder for any other sample width
+        return null;
+    }
+
+    /**
+     * Sets a float sample value for a specific sample block number and channel.
+     * Converts float values to appropriate integer values and clips properly.
+     * Allows to append samples (in order).
+     *
+     * @param float $sampleFloat  (Required) The float sample value to set. Converts float values and clips if necessary.
+     * @param int $blockNum  (Required) The sample block number to set or append. Zero based.
+     * @param int $channelNum  (Required) The channel number within the sample block to set or append. First channel is 1.
+     * @return $this
+     * @throws WavFileException  If the channel number or the resulting sample position is out of range.
+     */
+    public function setSampleValue($sampleFloat, $blockNum, $channelNum)
+    {
+        // check preconditions
+        if ($channelNum < 1 || $channelNum > $this->_numChannels) {
+            throw new WavFileException('Channel number is out of range.');
+        }
+
+        if (!$this->_dataSize_valid) {
+            $this->setDataSize();  // implicit setSize(), setActualSize(), setNumBlocks()
+        }
+
+        $dataSize = $this->_dataSize;
+        $bitsPerSample = $this->_bitsPerSample;
+        $sampleBytes = $bitsPerSample / 8;
+        $offset = $blockNum * $this->_blockAlign + ($channelNum - 1) * $sampleBytes;
+        if (($offset + $sampleBytes > $dataSize && $offset != $dataSize) || $offset < 0) { // allow appending
+            throw new WavFileException('Sample block or channel number is out of range.');
+        }
+
+
+        // convert to value, quantize and clip
+        if ($bitsPerSample == 32) {
+            // float samples are stored as-is, only clipped to [-1.0, 1.0]
+            $sample = $sampleFloat < -1.0 ? -1.0 : ($sampleFloat > 1.0 ? 1.0 : $sampleFloat);
+        } else {
+            $p = 1 << ($bitsPerSample - 1); // 2 to the power of _bitsPerSample divided by 2
+
+            // project and quantize (round) float to integer values
+            $sample = $sampleFloat < 0 ? (int)($sampleFloat * $p - 0.5) : (int)($sampleFloat * $p + 0.5);
+
+            // clip if necessary to [-$p, $p - 1]
+            if ($sample < -$p) {
+                $sample = -$p;
+            } elseif ($sample > $p - 1) {
+                $sample = $p - 1;
+            }
+        }
+
+        // convert to binary
+        switch ($bitsPerSample) {
+            case 8:
+                // unsigned char
+                $sampleBinary = chr($sample + 0x80);
+                break;
+
+            case 16:
+                // signed short, little endian
+                if ($sample < 0) {
+                    $sample += 0x10000;
+                }
+                $sampleBinary = pack('v', $sample);
+                break;
+
+            case 24:
+                // 3 byte packed signed integer, little endian
+                if ($sample < 0) {
+                    $sample += 0x1000000;
+                }
+                $sampleBinary = pack('C3', $sample & 0xff, ($sample >> 8) & 0xff, ($sample >> 16) & 0xff);
+                break;
+
+            case 32:
+                // 32-bit float
+                $sampleBinary = pack('f', $sample);
+                break;
+
+            default:
+                // unsupported sample width: write nothing
+                $sampleBinary = null;
+                $sampleBytes = 0;
+                break;
+        }
+
+        // replace or append data
+        if ($offset == $dataSize) {
+            // append
+            $this->_samples    .= $sampleBinary;
+            $this->_dataSize   += $sampleBytes;
+            $this->_chunkSize  += $sampleBytes;
+            $this->_actualSize += $sampleBytes;
+            $this->_numBlocks   = (int)($this->_dataSize / $this->_blockAlign);
+        } else {
+            // replace
+            // square brackets are required here: the curly-brace offset syntax ($str{$i}) was
+            // deprecated in PHP 7.4 and is a parse error in PHP 8
+            for ($i = 0; $i < $sampleBytes; ++$i) {
+                $this->_samples[$offset + $i] = $sampleBinary[$i];
+            }
+        }
+
+        return $this;
+    }
+
+
+    /*%******************************************************************************************%*/
+    // Audio processing methods
+
+    /**
+     * Run samples through audio processing filters.
+     *
+     *
+     * $wav->filter(
+     *      array(
+     *          WavFile::FILTER_MIX => array(          // Filter for mixing 2 WavFile instances.
+     *              'wav'         => $wav2,  // (Required) The WavFile to mix into this WavFile. If no optional arguments are given, can be passed without the array.
+     *              'loop'        => true,   // (Optional) Loop the selected portion (with wrapping to the beginning at the end).
+     *              'blockOffset' => 0,      // (Optional) Block number to start mixing from.
+     *              'numBlocks'   => null    // (Optional) Number of blocks to mix in or to select for looping.
Defaults to the end or all data for looping.
+     *          ),
+     *          WavFile::FILTER_NORMALIZE => 0.6,      // (Required) Normalization of (mixed) audio samples - see threshold parameter for normalizeSample().
+     *          WavFile::FILTER_DEGRADE   => 0.9,      // (Required) Introduce random noise. The quality relative to the amplitude. 1 = no noise, 0 = max. noise.
+     *          WavFile::FILTER_VOLUME    => 1.0       // (Required) Amplify or attenuate the audio signal. Beware of clipping when amplifying. Values range from >= 0 - <= 2. 1 = no change in volume; 0.5 = 50% reduction of volume; 1.5 = 50% increase in volume.
+     *      ),
+     *      0,      // (Optional) The block number of this WavFile to start with.
+     *      null    // (Optional) The number of blocks to process.
+     * );
+     *
+     *
+     * @param array $filters  (Required) An array of 1 or more audio processing filters.
+     * @param int $blockOffset  (Optional) The block number to start processing from.
+     * @param int $numBlocks  (Optional) The maximum number of blocks to process.
+     * @return $this
+     * @throws WavFileException  If the WavFile to mix is missing, invalid, or differs in sample rate or number of channels.
+     */
+    public function filter($filters, $blockOffset = 0, $numBlocks = null)
+    {
+        // check preconditions
+        $totalBlocks = $this->getNumBlocks();
+        $numChannels = $this->getNumChannels();
+        if (is_null($numBlocks)) $numBlocks = $totalBlocks - $blockOffset;
+
+        if (!is_array($filters) || empty($filters) || $blockOffset < 0 || $blockOffset > $totalBlocks || $numBlocks <= 0) {
+            // nothing to do
+            return $this;
+        }
+
+        // check filters
+        $filter_mix = false;
+        if (array_key_exists(self::FILTER_MIX, $filters)) {
+            if (!is_array($filters[self::FILTER_MIX])) {
+                // assume the 'wav' parameter
+                $filters[self::FILTER_MIX] = array('wav' => $filters[self::FILTER_MIX]);
+            }
+
+            $mix_wav = @$filters[self::FILTER_MIX]['wav'];
+            if (!($mix_wav instanceof WavFile)) {
+                throw new WavFileException("WavFile to mix is missing or invalid.");
+            } elseif ($mix_wav->getSampleRate() != $this->getSampleRate()) {
+                throw new WavFileException("Sample rate of WavFile to mix does not match.");
+            } else if ($mix_wav->getNumChannels() != $this->getNumChannels()) {
+                throw new WavFileException("Number of channels of WavFile to mix does not match.");
+            }
+
+            $mix_loop = @$filters[self::FILTER_MIX]['loop'];
+            if (is_null($mix_loop)) $mix_loop = false;
+
+            $mix_blockOffset = @$filters[self::FILTER_MIX]['blockOffset'];
+            if (is_null($mix_blockOffset)) $mix_blockOffset = 0;
+
+            $mix_totalBlocks = $mix_wav->getNumBlocks();
+            $mix_numBlocks = @$filters[self::FILTER_MIX]['numBlocks'];
+            if (is_null($mix_numBlocks)) $mix_numBlocks = $mix_loop ? $mix_totalBlocks : $mix_totalBlocks - $mix_blockOffset;
+            $mix_maxBlock = min($mix_blockOffset + $mix_numBlocks, $mix_totalBlocks);
+
+            // a looped mix takes "% $mix_numBlocks" and "% $mix_totalBlocks" for every sample;
+            // with an empty selection or an empty WavFile that would be a modulo by zero,
+            // and there is nothing to mix in anyway
+            $filter_mix = !($mix_loop && ($mix_numBlocks == 0 || $mix_totalBlocks == 0));
+        }
+
+        $filter_normalize = false;
+        if (array_key_exists(self::FILTER_NORMALIZE, $filters)) {
+            $normalize_threshold = @$filters[self::FILTER_NORMALIZE];
+
+            // a missing threshold or a threshold of +/-1 switches the filter off
+            if (!is_null($normalize_threshold) && abs($normalize_threshold) != 1) $filter_normalize = true;
+        }
+
+        $filter_degrade = false;
+        if (array_key_exists(self::FILTER_DEGRADE, $filters)) {
+            $degrade_quality = @$filters[self::FILTER_DEGRADE];
+            if (is_null($degrade_quality)) $degrade_quality = 1;
+
+            if ($degrade_quality >= 0 && $degrade_quality < 1) {
+                $filter_degrade = true;
+
+                // noise amplitude in millionths, computed once; the explicit cast truncates exactly
+                // like the former implicit float-to-int conversion of the rand() arguments did,
+                // without triggering the PHP 8.1 deprecation notice
+                $degrade_range = (int)(1000000 * (1 - $degrade_quality));
+            }
+        }
+
+        $filter_vol = false;
+        if (array_key_exists(self::FILTER_VOLUME, $filters)) {
+            $volume_amount = @$filters[self::FILTER_VOLUME];
+            if (is_null($volume_amount)) $volume_amount = 1;
+
+            if ($volume_amount >= 0 && $volume_amount <= 2 && $volume_amount != 1.0) {
+                $filter_vol = true;
+            }
+        }
+
+
+        // loop through all sample blocks
+        for ($block = 0; $block < $numBlocks; ++$block) {
+            // loop through all channels
+            for ($channel = 1; $channel <= $numChannels; ++$channel) {
+                // read current sample
+                $currentBlock = $blockOffset + $block;
+                $sampleFloat = $this->getSampleValue($currentBlock, $channel);
+
+
+                /************* MIX FILTER ***********************/
+                if ($filter_mix) {
+                    if ($mix_loop) {
+                        // wrap around inside the selected portion and inside the mixed file
+                        $mixBlock = ($mix_blockOffset + ($block % $mix_numBlocks)) % $mix_totalBlocks;
+                    } else {
+                        $mixBlock = $mix_blockOffset + $block;
+                    }
+
+                    if ($mixBlock < $mix_maxBlock) {
+                        $sampleFloat += $mix_wav->getSampleValue($mixBlock, $channel);
+                    }
+                }
+
+                /************* NORMALIZE FILTER *******************/
+                if ($filter_normalize) {
+                    $sampleFloat = $this->normalizeSample($sampleFloat, $normalize_threshold);
+                }
+
+                /************* DEGRADE FILTER *******************/
+                if ($filter_degrade) {
+                    $sampleFloat += rand(-$degrade_range, $degrade_range) / 1000000;
+                }
+
+                /************* VOLUME FILTER *******************/
+                if ($filter_vol) {
+                    $sampleFloat *= $volume_amount;
+                }
+
+                // write current sample
+                $this->setSampleValue($sampleFloat, $currentBlock, $channel);
+            }
+        }
+
+        return $this;
+    }
+
+    /**
+     * Append a wav file to the current wav.
+     * Sample rate, bits per sample and number of channels of both wav files have to be identical.
+     *
+     * @param WavFile $wav  (Required) The wav file whose samples are appended.
+     * @return $this
+     * @throws WavFileException  If one of the three format properties differs.
+     */
+    public function appendWav(WavFile $wav) {
+        // the formats have to agree, otherwise the combined sample buffer would be inconsistent
+        if ($wav->getSampleRate() != $this->getSampleRate()) {
+            throw new WavFileException("Sample rate for wav files do not match.");
+        }
+
+        if ($wav->getBitsPerSample() != $this->getBitsPerSample()) {
+            throw new WavFileException("Bits per sample for wav files do not match.");
+        }
+
+        if ($wav->getNumChannels() != $this->getNumChannels()) {
+            throw new WavFileException("Number of channels for wav files do not match.");
+        }
+
+        // concatenate the raw samples, then let setDataSize() refresh the size fields
+        $this->_samples = $this->_samples . $wav->_samples;
+        $this->setDataSize();  // implicit setSize(), setActualSize(), setNumBlocks()
+
+        return $this;
+    }
+
+    /**
+     * Mix 2 wav files together.
+     * The sample rate and the number of channels of both wavs have to be identical.
+     *
+     * @param WavFile $wav  (Required) The WavFile to mix.
+     * @param float $normalizeThreshold  (Optional) See normalizeSample for an explanation.
+     * @return $this  The result of filter().
+     * @throws WavFileException
+     */
+    public function mergeWav(WavFile $wav, $normalizeThreshold = null) {
+        // mixing is delegated to filter(): a mix filter plus an optional normalize filter
+        $filters = array(
+            WavFile::FILTER_MIX       => $wav,
+            WavFile::FILTER_NORMALIZE => $normalizeThreshold
+        );
+
+        return $this->filter($filters);
+    }
+
+    /**
+     * Add silence to the wav file.
+     *
+     * @param float $duration  (Optional) Seconds of silence to add. A negative value adds the silence at the beginning of the file. Defaults to 1s.
+     * @return $this
+     */
+    public function insertSilence($duration = 1.0)
+    {
+        $sampleCount  = (int)($this->getSampleRate() * abs($duration));
+        $channelCount = $this->getNumChannels();
+
+        // one packed zero-amplitude sample, repeated for every channel of every sample block
+        $silentSample = self::packSample($this->getZeroAmplitude(), $this->getBitsPerSample());
+        $silence      = str_repeat($silentSample, $sampleCount * $channelCount);
+
+        // non-negative duration: silence goes after the audio, otherwise in front of it
+        $this->_samples = ($duration >= 0) ? $this->_samples . $silence : $silence . $this->_samples;
+
+        $this->setDataSize();  // implicit setSize(), setActualSize(), setNumBlocks()
+
+        return $this;
+    }
+
+    /**
+     * Degrade the quality of the wav file by introducing random noise.
+     *
+     * @param float $quality  (Optional) The quality relative to the amplitude. 1 = no noise, 0 = max. noise.
+     * @return $this  The result of filter().
+     */
+    public function degrade($quality = 1.0)
+    {
+        // delegated to filter() with just the degrade filter
+        $filters = array(
+            self::FILTER_DEGRADE => $quality
+        );
+
+        return $this->filter($filters);
+    }
+
+    /**
+     * Generate noise at the end of the wav for the specified duration and volume.
+     *
+     * @param float $duration  (Optional) Number of seconds of noise to generate.
+     * @param float $percent  (Optional) The percentage of the maximum amplitude to use. 100 = full amplitude.
+     * @return $this
+     */
+    public function generateNoise($duration = 1.0, $percent = 100)
+    {
+        $channelCount = $this->getNumChannels();
+        $sampleCount  = $this->getSampleRate() * $duration;
+        $lowest       = $this->getMinAmplitude();
+        $highest      = $this->getMaxAmplitude();
+        $bits         = $this->getBitsPerSample();
+        $isFloat      = ($bits == 32);
+
+        $index = 0;
+        while ($index < $sampleCount) {
+            if ($isFloat) {
+                // float samples: random value scaled by $percent, in millionths
+                $value = rand(-$percent * 10000, $percent * 10000) / 1000000;
+            } else {
+                // integer samples: random amplitude, scaled down to $percent
+                $value = (int)(rand($lowest, $highest) * $percent / 100);
+            }
+
+            // the same random value is written to every channel of the block
+            $this->_samples .= str_repeat(self::packSample($value, $bits), $channelCount);
+            ++$index;
+        }
+
+        $this->setDataSize();  // implicit setSize(), setActualSize(), setNumBlocks()
+
+        return $this;
+    }
+
+    /**
+     * Convert sample data to different bits per sample.
+     *
+     * @param int $bitsPerSample  (Required) The new number of bits per sample.
+     * @return $this
+     * @throws WavFileException
+     */
+    public function convertBitsPerSample($bitsPerSample) {
+        // already in the requested format
+        if ($this->getBitsPerSample() == $bitsPerSample) {
+            return $this;
+        }
+
+        // mix this wav into a new wav of the target bit depth; the samples are
+        // re-encoded at that depth as they are written
+        $converted = new WavFile($this->getNumChannels(), $this->getSampleRate(), $bitsPerSample);
+        $mixSelf   = array(self::FILTER_MIX => $this);
+        $converted->filter($mixSelf, 0, $this->getNumBlocks());
+
+        // switch this instance to the new bit depth, then take over the converted samples
+        $this->setSamples()                       // implicit setDataSize(), setSize(), setActualSize(), setNumBlocks()
+             ->setBitsPerSample($bitsPerSample);  // implicit setValidBitsPerSample(), setAudioFormat(), setAudioSubFormat(), setFmtChunkSize(), setFactChunkSize(), setSize(), setActualSize(), setDataOffset(), setByteRate(), setBlockAlign(), setNumBlocks()
+        $this->_samples = $converted->_samples;
+        $this->setDataSize();  // implicit setSize(), setActualSize(), setNumBlocks()
+
+        return $this;
+    }
+
+
+    /*%******************************************************************************************%*/
+    // Miscellaneous methods
+
+    /**
+     * Output information about the wav object.
+     *
+     * @return string  One "label: value" line per header field; line breaks are converted with nl2br() unless running under the CLI SAPI.
+     */
+    public function displayInfo()
+    {
+        // one format line per field, in the same order as the values below
+        $format = implode('', array(
+            "File Size: %u\n",
+            "Chunk Size: %u\n",
+            "fmt Subchunk Size: %u\n",
+            "Extended fmt Size: %u\n",
+            "fact Subchunk Size: %u\n",
+            "Data Offset: %u\n",
+            "Data Size: %u\n",
+            "Audio Format: %s\n",
+            "Audio SubFormat: %s\n",
+            "Channels: %u\n",
+            "Channel Mask: 0x%s\n",
+            "Sample Rate: %u\n",
+            "Bits Per Sample: %u\n",
+            "Valid Bits Per Sample: %u\n",
+            "Sample Block Size: %u\n",
+            "Number of Sample Blocks: %u\n",
+            "Byte Rate: %uBps\n",
+        ));
+
+        $values = array(
+            $this->getActualSize(),
+            $this->getChunkSize(),
+            $this->getFmtChunkSize(),
+            $this->getFmtExtendedSize(),
+            $this->getFactChunkSize(),
+            $this->getDataOffset(),
+            $this->getDataSize(),
+            $this->getAudioFormat() == self::WAVE_FORMAT_PCM ? 'PCM' : ($this->getAudioFormat() == self::WAVE_FORMAT_IEEE_FLOAT ? 'IEEE FLOAT' : 'EXTENSIBLE'),
+            $this->getAudioSubFormat() == self::WAVE_SUBFORMAT_PCM ? 'PCM' : 'IEEE FLOAT',
+            $this->getNumChannels(),
+            dechex($this->getChannelMask()),
+            $this->getSampleRate(),
+            $this->getBitsPerSample(),
+            $this->getValidBitsPerSample(),
+            $this->getBlockAlign(),
+            $this->getNumBlocks(),
+            $this->getByteRate(),
+        );
+
+        $info = vsprintf($format, $values);
+
+        // plain text on the command line, <br /> line breaks everywhere else
+        return php_sapi_name() == 'cli' ? $info : nl2br($info);
+    }
+}
+
+
+/*%******************************************************************************************%*/
+// Exceptions
+
+/**
+ * WavFileException is thrown for an illegal state or argument in the WavFile class.
+ */
+class WavFileException extends Exception {}
+
+/**
+ * WavFormatException indicates a malformed or unsupported wav file header.
+ */ +class WavFormatException extends WavFileException {} diff --git a/html/securimage/audio/.htaccess b/html/securimage/audio/.htaccess new file mode 100755 index 0000000..4fdb24a --- /dev/null +++ b/html/securimage/audio/.htaccess @@ -0,0 +1,11 @@ +# Deny access to this folder + +# Apache 2.4 + + Require all denied + + +# Apache 2.2 + + Deny from all + diff --git a/html/securimage/audio/en/0.wav b/html/securimage/audio/en/0.wav new file mode 100755 index 0000000..8a4bd6f Binary files /dev/null and b/html/securimage/audio/en/0.wav differ diff --git a/html/securimage/audio/en/1.wav b/html/securimage/audio/en/1.wav new file mode 100755 index 0000000..fa15b45 Binary files /dev/null and b/html/securimage/audio/en/1.wav differ diff --git a/html/securimage/audio/en/10.wav b/html/securimage/audio/en/10.wav new file mode 100755 index 0000000..c6d7d73 Binary files /dev/null and b/html/securimage/audio/en/10.wav differ diff --git a/html/securimage/audio/en/11.wav b/html/securimage/audio/en/11.wav new file mode 100755 index 0000000..ccc3494 Binary files /dev/null and b/html/securimage/audio/en/11.wav differ diff --git a/html/securimage/audio/en/12.wav b/html/securimage/audio/en/12.wav new file mode 100755 index 0000000..bf8f8ef Binary files /dev/null and b/html/securimage/audio/en/12.wav differ diff --git a/html/securimage/audio/en/13.wav b/html/securimage/audio/en/13.wav new file mode 100755 index 0000000..0166cce Binary files /dev/null and b/html/securimage/audio/en/13.wav differ diff --git a/html/securimage/audio/en/14.wav b/html/securimage/audio/en/14.wav new file mode 100755 index 0000000..01317a2 Binary files /dev/null and b/html/securimage/audio/en/14.wav differ diff --git a/html/securimage/audio/en/15.wav b/html/securimage/audio/en/15.wav new file mode 100755 index 0000000..c0d8e3f Binary files /dev/null and b/html/securimage/audio/en/15.wav differ diff --git a/html/securimage/audio/en/16.wav b/html/securimage/audio/en/16.wav new file mode 100755 index 
0000000..c310394 Binary files /dev/null and b/html/securimage/audio/en/16.wav differ diff --git a/html/securimage/audio/en/17.wav b/html/securimage/audio/en/17.wav new file mode 100755 index 0000000..1577a40 Binary files /dev/null and b/html/securimage/audio/en/17.wav differ diff --git a/html/securimage/audio/en/18.wav b/html/securimage/audio/en/18.wav new file mode 100755 index 0000000..3caa0f2 Binary files /dev/null and b/html/securimage/audio/en/18.wav differ diff --git a/html/securimage/audio/en/19.wav b/html/securimage/audio/en/19.wav new file mode 100755 index 0000000..54f8243 Binary files /dev/null and b/html/securimage/audio/en/19.wav differ diff --git a/html/securimage/audio/en/2.wav b/html/securimage/audio/en/2.wav new file mode 100755 index 0000000..2e61d24 Binary files /dev/null and b/html/securimage/audio/en/2.wav differ diff --git a/html/securimage/audio/en/20.wav b/html/securimage/audio/en/20.wav new file mode 100755 index 0000000..abeebe3 Binary files /dev/null and b/html/securimage/audio/en/20.wav differ diff --git a/html/securimage/audio/en/3.wav b/html/securimage/audio/en/3.wav new file mode 100755 index 0000000..03c6157 Binary files /dev/null and b/html/securimage/audio/en/3.wav differ diff --git a/html/securimage/audio/en/4.wav b/html/securimage/audio/en/4.wav new file mode 100755 index 0000000..61834ca Binary files /dev/null and b/html/securimage/audio/en/4.wav differ diff --git a/html/securimage/audio/en/5.wav b/html/securimage/audio/en/5.wav new file mode 100755 index 0000000..6836a6e Binary files /dev/null and b/html/securimage/audio/en/5.wav differ diff --git a/html/securimage/audio/en/6.wav b/html/securimage/audio/en/6.wav new file mode 100755 index 0000000..01c85f0 Binary files /dev/null and b/html/securimage/audio/en/6.wav differ diff --git a/html/securimage/audio/en/7.wav b/html/securimage/audio/en/7.wav new file mode 100755 index 0000000..91b96fa Binary files /dev/null and b/html/securimage/audio/en/7.wav differ diff --git 
a/html/securimage/audio/en/8.wav b/html/securimage/audio/en/8.wav new file mode 100755 index 0000000..16cf893 Binary files /dev/null and b/html/securimage/audio/en/8.wav differ diff --git a/html/securimage/audio/en/9.wav b/html/securimage/audio/en/9.wav new file mode 100755 index 0000000..e81e2a4 Binary files /dev/null and b/html/securimage/audio/en/9.wav differ diff --git a/html/securimage/audio/en/A.wav b/html/securimage/audio/en/A.wav new file mode 100755 index 0000000..047d906 Binary files /dev/null and b/html/securimage/audio/en/A.wav differ diff --git a/html/securimage/audio/en/B.wav b/html/securimage/audio/en/B.wav new file mode 100755 index 0000000..34c1f19 Binary files /dev/null and b/html/securimage/audio/en/B.wav differ diff --git a/html/securimage/audio/en/C.wav b/html/securimage/audio/en/C.wav new file mode 100755 index 0000000..933f428 Binary files /dev/null and b/html/securimage/audio/en/C.wav differ diff --git a/html/securimage/audio/en/D.wav b/html/securimage/audio/en/D.wav new file mode 100755 index 0000000..1879e0b Binary files /dev/null and b/html/securimage/audio/en/D.wav differ diff --git a/html/securimage/audio/en/E.wav b/html/securimage/audio/en/E.wav new file mode 100755 index 0000000..7e483f1 Binary files /dev/null and b/html/securimage/audio/en/E.wav differ diff --git a/html/securimage/audio/en/F.wav b/html/securimage/audio/en/F.wav new file mode 100755 index 0000000..ab3f6b9 Binary files /dev/null and b/html/securimage/audio/en/F.wav differ diff --git a/html/securimage/audio/en/G.wav b/html/securimage/audio/en/G.wav new file mode 100755 index 0000000..517b38b Binary files /dev/null and b/html/securimage/audio/en/G.wav differ diff --git a/html/securimage/audio/en/H.wav b/html/securimage/audio/en/H.wav new file mode 100755 index 0000000..5f4e761 Binary files /dev/null and b/html/securimage/audio/en/H.wav differ diff --git a/html/securimage/audio/en/I.wav b/html/securimage/audio/en/I.wav new file mode 100755 index 0000000..7450519 Binary 
files /dev/null and b/html/securimage/audio/en/I.wav differ diff --git a/html/securimage/audio/en/J.wav b/html/securimage/audio/en/J.wav new file mode 100755 index 0000000..7870a5c Binary files /dev/null and b/html/securimage/audio/en/J.wav differ diff --git a/html/securimage/audio/en/K.wav b/html/securimage/audio/en/K.wav new file mode 100755 index 0000000..38fbdfe Binary files /dev/null and b/html/securimage/audio/en/K.wav differ diff --git a/html/securimage/audio/en/L.wav b/html/securimage/audio/en/L.wav new file mode 100755 index 0000000..da7e679 Binary files /dev/null and b/html/securimage/audio/en/L.wav differ diff --git a/html/securimage/audio/en/M.wav b/html/securimage/audio/en/M.wav new file mode 100755 index 0000000..c61aab3 Binary files /dev/null and b/html/securimage/audio/en/M.wav differ diff --git a/html/securimage/audio/en/MINUS.wav b/html/securimage/audio/en/MINUS.wav new file mode 100755 index 0000000..cb2c086 Binary files /dev/null and b/html/securimage/audio/en/MINUS.wav differ diff --git a/html/securimage/audio/en/N.wav b/html/securimage/audio/en/N.wav new file mode 100755 index 0000000..280ef23 Binary files /dev/null and b/html/securimage/audio/en/N.wav differ diff --git a/html/securimage/audio/en/O.wav b/html/securimage/audio/en/O.wav new file mode 100755 index 0000000..98068d5 Binary files /dev/null and b/html/securimage/audio/en/O.wav differ diff --git a/html/securimage/audio/en/P.wav b/html/securimage/audio/en/P.wav new file mode 100755 index 0000000..546aa73 Binary files /dev/null and b/html/securimage/audio/en/P.wav differ diff --git a/html/securimage/audio/en/PLUS.wav b/html/securimage/audio/en/PLUS.wav new file mode 100755 index 0000000..f340b6c Binary files /dev/null and b/html/securimage/audio/en/PLUS.wav differ diff --git a/html/securimage/audio/en/Q.wav b/html/securimage/audio/en/Q.wav new file mode 100755 index 0000000..57cafe5 Binary files /dev/null and b/html/securimage/audio/en/Q.wav differ diff --git 
a/html/securimage/audio/en/R.wav b/html/securimage/audio/en/R.wav new file mode 100755 index 0000000..e16d66b Binary files /dev/null and b/html/securimage/audio/en/R.wav differ diff --git a/html/securimage/audio/en/S.wav b/html/securimage/audio/en/S.wav new file mode 100755 index 0000000..1b4fb82 Binary files /dev/null and b/html/securimage/audio/en/S.wav differ diff --git a/html/securimage/audio/en/T.wav b/html/securimage/audio/en/T.wav new file mode 100755 index 0000000..347be1a Binary files /dev/null and b/html/securimage/audio/en/T.wav differ diff --git a/html/securimage/audio/en/TIMES.wav b/html/securimage/audio/en/TIMES.wav new file mode 100755 index 0000000..85692b8 Binary files /dev/null and b/html/securimage/audio/en/TIMES.wav differ diff --git a/html/securimage/audio/en/U.wav b/html/securimage/audio/en/U.wav new file mode 100755 index 0000000..2ffc415 Binary files /dev/null and b/html/securimage/audio/en/U.wav differ diff --git a/html/securimage/audio/en/V.wav b/html/securimage/audio/en/V.wav new file mode 100755 index 0000000..a9748c7 Binary files /dev/null and b/html/securimage/audio/en/V.wav differ diff --git a/html/securimage/audio/en/W.wav b/html/securimage/audio/en/W.wav new file mode 100755 index 0000000..aaa9f79 Binary files /dev/null and b/html/securimage/audio/en/W.wav differ diff --git a/html/securimage/audio/en/X.wav b/html/securimage/audio/en/X.wav new file mode 100755 index 0000000..39253b0 Binary files /dev/null and b/html/securimage/audio/en/X.wav differ diff --git a/html/securimage/audio/en/Y.wav b/html/securimage/audio/en/Y.wav new file mode 100755 index 0000000..be62a8f Binary files /dev/null and b/html/securimage/audio/en/Y.wav differ diff --git a/html/securimage/audio/en/Z.wav b/html/securimage/audio/en/Z.wav new file mode 100755 index 0000000..8714c83 Binary files /dev/null and b/html/securimage/audio/en/Z.wav differ diff --git a/html/securimage/audio/en/error.wav b/html/securimage/audio/en/error.wav new file mode 100755 index 
0000000..35209ab Binary files /dev/null and b/html/securimage/audio/en/error.wav differ diff --git a/html/securimage/audio/noise/check-point-1.wav b/html/securimage/audio/noise/check-point-1.wav new file mode 100755 index 0000000..9533b12 Binary files /dev/null and b/html/securimage/audio/noise/check-point-1.wav differ diff --git a/html/securimage/audio/noise/crowd-talking-1.wav b/html/securimage/audio/noise/crowd-talking-1.wav new file mode 100755 index 0000000..7f451df Binary files /dev/null and b/html/securimage/audio/noise/crowd-talking-1.wav differ diff --git a/html/securimage/audio/noise/crowd-talking-6.wav b/html/securimage/audio/noise/crowd-talking-6.wav new file mode 100755 index 0000000..fd9a10d Binary files /dev/null and b/html/securimage/audio/noise/crowd-talking-6.wav differ diff --git a/html/securimage/audio/noise/crowd-talking-7.wav b/html/securimage/audio/noise/crowd-talking-7.wav new file mode 100755 index 0000000..986f6ae Binary files /dev/null and b/html/securimage/audio/noise/crowd-talking-7.wav differ diff --git a/html/securimage/audio/noise/kids-playing-1.wav b/html/securimage/audio/noise/kids-playing-1.wav new file mode 100755 index 0000000..cb9d17b Binary files /dev/null and b/html/securimage/audio/noise/kids-playing-1.wav differ diff --git a/html/securimage/backgrounds/bg3.jpg b/html/securimage/backgrounds/bg3.jpg new file mode 100755 index 0000000..a2d62d6 Binary files /dev/null and b/html/securimage/backgrounds/bg3.jpg differ diff --git a/html/securimage/backgrounds/bg4.jpg b/html/securimage/backgrounds/bg4.jpg new file mode 100755 index 0000000..37a22f8 Binary files /dev/null and b/html/securimage/backgrounds/bg4.jpg differ diff --git a/html/securimage/backgrounds/bg5.jpg b/html/securimage/backgrounds/bg5.jpg new file mode 100755 index 0000000..0a04181 Binary files /dev/null and b/html/securimage/backgrounds/bg5.jpg differ diff --git a/html/securimage/backgrounds/bg6.png b/html/securimage/backgrounds/bg6.png new file mode 100755 index 
0000000..22f9d67 Binary files /dev/null and b/html/securimage/backgrounds/bg6.png differ diff --git a/html/securimage/captcha.html b/html/securimage/captcha.html new file mode 100755 index 0000000..0343fb0 --- /dev/null +++ b/html/securimage/captcha.html @@ -0,0 +1,136 @@ + + + + + + + Sample CAPTCHA HTML + + + + +

Note: Running this on a PHP enabled server will likely work, but you should use example_form.php for testing instead.

+ +
+ CAPTCHA Image +
+ +
+
+ + Play CAPTCHA Audio + + + +
+ + + Refresh Image + +
+ + + +
+ + + +
+ + + + + +

+ + +
+ + CAPTCHA Image + + +
+ + +
+ +
+ + + Play CAPTCHA Audio + + + +
+ + + + Refresh Image + +
+ + + + +
+ + + + +
+ + + diff --git a/html/securimage/config.inc.php.SAMPLE b/html/securimage/config.inc.php.SAMPLE new file mode 100755 index 0000000..43dcd54 --- /dev/null +++ b/html/securimage/config.inc.php.SAMPLE @@ -0,0 +1,87 @@ + 275, // width of captcha image in pixels + 'image_height' => 100, // height of captcha image in pixels + 'code_length' => 6, // # of characters for captcha code + 'image_bg_color' => '#ffffff', // hex color for image background + 'text_color' => '#707070', // hex color for captcha text + 'line_color' => '#707070', // hex color for lines over text + 'noise_color' => '#707070', // color of random noise to draw under text + 'num_lines' => 3, // # of lines to draw over text + 'noise_level' => 4, // how much random noise to add (0-10) + 'perturbation' => 0.7, // distortion level + + 'use_random_spaces' => true, + 'use_random_baseline' => true, + 'use_text_angles' => true, + 'use_random_boxes' => false, + + 'wordlist_file' => 'words/words.txt', // text file for word captcha + 'use_wordlist' => false, // true to use word list + 'wordlist_file_encoding' => null, // character encoding of word file if other than ASCII (e.g. 
UTF-8, GB2312) + + // example UTF-8 charset (TTF file must support symbols being used) + // 'charset' => "абвгдeжзийклмнопрстуфхцчшщъьюяАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯ", + + 'ttf_file' => './AHGBold.ttf', // TTF file for captcha text + + //'captcha_type' => Securimage::SI_CAPTCHA_WORDS, // Securimage::SI_CAPTCHA_STRING || Securimage::SI_CAPTCHA_MATHEMATIC || Securimage::SI_CAPTCHA_WORDS + + //'display_value' => 'ABC 123', // Draws custom text on captcha + + + /**** Code Storage & Database Options ****/ + + // true if you *DO NOT* want to use PHP sessions at all, false to use PHP sessions + 'no_session' => false, + + // the PHP session name to use (null for default PHP session name) + // do not change unless you know what you are doing + 'session_name' => null, + + // change to true to store codes in a database + 'use_database' => false, + + // database engine to use for storing codes. must have the PDO extension loaded + // Valid choices are: + // Securimage::SI_DRIVER_MYSQL, Securimage::SI_DRIVER_SQLITE3, Securimage::SI_DRIVER_PGSQL + 'database_driver' => Securimage::SI_DRIVER_MYSQL, + + 'database_host' => 'localhost', // database server host to connect to + 'database_user' => 'root', // database user to connect as + 'database_pass' => '', // database user password + 'database_name' => 'securimage', // name of database to select (you must create this first or use an existing database) + 'database_table' => 'captcha_codes', // database table for storing codes, will be created automatically + + // Securimage will automatically create the database table if it is not found + // change to true for performance reasons once database table is up and running + 'skip_table_check' => false, + + /**** Audio Options ****/ + + //'audio_path' => __DIR__ . '/audio/en/', + //'audio_use_noise' => true, + //'audio_noise_path' => __DIR__ . 
'/audio/noise/', + //'degrade_audio' => true, +); diff --git a/html/securimage/database/.htaccess b/html/securimage/database/.htaccess new file mode 100755 index 0000000..4fdb24a --- /dev/null +++ b/html/securimage/database/.htaccess @@ -0,0 +1,11 @@ +# Deny access to this folder + +# Apache 2.4 + + Require all denied + + +# Apache 2.2 + + Deny from all + diff --git a/html/securimage/database/index.html b/html/securimage/database/index.html new file mode 100755 index 0000000..8d1c8b6 --- /dev/null +++ b/html/securimage/database/index.html @@ -0,0 +1 @@ + diff --git a/html/securimage/database/securimage.sq3 b/html/securimage/database/securimage.sq3 new file mode 100755 index 0000000..a3fcbd7 Binary files /dev/null and b/html/securimage/database/securimage.sq3 differ diff --git a/html/securimage/example_form.ajax.php b/html/securimage/example_form.ajax.php new file mode 100755 index 0000000..e066f22 --- /dev/null +++ b/html/securimage/example_form.ajax.php @@ -0,0 +1,205 @@ + + + + + + Securimage Example Form + + + + +
+Example Form + +

+ This is an example PHP form that processes user information, checks for errors, and validates the captcha code.
+ This example form also demonstrates how to submit a form to itself to display error messages. +

+ + + +
+ + +

+ Name*:
+ +

+ +

+ Email*:
+ +

+ +

+ URL:
+ +

+ +

+ Message*:
+ +

+ +

+ 'ct_captcha')); ?> +

+ +

+
+ +

+ +
+
+ + + + + + + + $value) { + if (!is_array($key)) { + // sanitize the input data + if ($key != 'ct_message') $value = strip_tags($value); + $_POST[$key] = htmlspecialchars(stripslashes(trim($value))); + } + } + + $name = @$_POST['ct_name']; // name from the form + $email = @$_POST['ct_email']; // email from the form + $URL = @$_POST['ct_URL']; // url from the form + $message = @$_POST['ct_message']; // the message from the form + $captcha = @$_POST['ct_captcha']; // the user's entry for the captcha code + $name = substr($name, 0, 64); // limit name to 64 characters + + $errors = array(); // initialize empty error array + + if (isset($GLOBALS['DEBUG_MODE']) && $GLOBALS['DEBUG_MODE'] == false) { + // only check for errors if the form is not in debug mode + + if (strlen($name) < 3) { + // name too short, add error + $errors['name_error'] = 'Your name is required'; + } + + if (strlen($email) == 0) { + // no email address given + $errors['email_error'] = 'Email address is required'; + } else if ( !preg_match('/^(?:[\w\d-]+\.?)+@(?:(?:[\w\d]\-?)+\.)+\w{2,4}$/i', $email)) { + // invalid email format + $errors['email_error'] = 'Email address entered is invalid'; + } + + if (strlen($message) < 20) { + // message length too short + $errors['message_error'] = 'Please enter a message'; + } + } + + // Only try to validate the captcha if the form has no errors + // This is especially important for ajax calls + if (sizeof($errors) == 0) { + require_once dirname(__FILE__) . '/securimage.php'; + $securimage = new Securimage(); + + if ($securimage->check($captcha) == false) { + $errors['captcha_error'] = 'Incorrect security code entered'; + } + } + + if (sizeof($errors) == 0) { + // no errors, send the form + $time = date('r'); + $message = "A message was submitted from the contact form. The following information was provided.

" + . "Name: $name
" + . "Email: $email
" + . "URL: $URL
" + . "Message:
" + . "
$message
" + . "

IP Address: {$_SERVER['REMOTE_ADDR']}
" + . "Time: $time
" + . "Browser: " . htmlspecialchars($_SERVER['HTTP_USER_AGENT']) . "
"; + + if (isset($GLOBALS['DEBUG_MODE']) && $GLOBALS['DEBUG_MODE'] == false) { + // send the message with mail() + mail($GLOBALS['ct_recipient'], $GLOBALS['ct_msg_subject'], $message, "From: {$GLOBALS['ct_recipient']}\r\nReply-To: {$email}\r\nContent-type: text/html; charset=ISO-8859-1\r\nMIME-Version: 1.0"); + } + + $return = array('error' => 0, 'message' => 'OK'); + die(json_encode($return)); + } else { + $errmsg = ''; + foreach($errors as $key => $error) { + // set up error messages to display with each field + $errmsg .= " - {$error}\n"; + } + + $return = array('error' => 1, 'message' => $errmsg); + die(json_encode($return)); + } + } // POST +} // function process_si_contact_form() diff --git a/html/securimage/example_form.php b/html/securimage/example_form.php new file mode 100755 index 0000000..8e726d8 --- /dev/null +++ b/html/securimage/example_form.php @@ -0,0 +1,232 @@ + + + + + + + Securimage Example Form + + + + + +
+Example Form + +

+ This is an example PHP form that processes user information, checks for errors, and validates the captcha code.
+ This example form also demonstrates how to submit a form to itself to display error messages. +

+ + +
There was a problem with your submission. Errors are displayed below in red.

+ +
The captcha was correct and the message has been sent! The captcha was solved in seconds.

+ + +
+ + +

+ + + +

+ +

+ + + +

+ +

+ + + +

+ +

+ + + +

+ +
+ \n"; + echo Securimage::getCaptchaHtml($options); + echo "\n
\n"; + + /* + // To render some or all captcha components individually + $options['input_name'] = 'ct_captcha_2'; + $options['image_id'] = 'ct_captcha_2'; + $options['input_id'] = 'ct_captcha_2'; + $options['namespace'] = 'captcha2'; + + echo "
\n
\n"; + echo Securimage::getCaptchaHtml($options, Securimage::HTML_IMG); + + echo Securimage::getCaptchaHtml($options, Securimage::HTML_ICON_REFRESH); + echo Securimage::getCaptchaHtml($options, Securimage::HTML_AUDIO); + + echo '
'; + + echo Securimage::getCaptchaHtml($options, Securimage::HTML_INPUT_LABEL); + echo Securimage::getCaptchaHtml($options, Securimage::HTML_INPUT); + echo "\n
"; + */ + ?> + + +

+
+ +

+ +
+
+ + + + + $value) { + if (!is_array($key)) { + // sanitize the input data + if ($key != 'ct_message') $value = strip_tags($value); + $_POST[$key] = htmlspecialchars(stripslashes(trim($value))); + } + } + + $name = @$_POST['ct_name']; // name from the form + $email = @$_POST['ct_email']; // email from the form + $URL = @$_POST['ct_URL']; // url from the form + $message = @$_POST['ct_message']; // the message from the form + $captcha = @$_POST['ct_captcha']; // the user's entry for the captcha code + $name = substr($name, 0, 64); // limit name to 64 characters + + $errors = array(); // initialize empty error array + + if (isset($GLOBALS['DEBUG_MODE']) && $GLOBALS['DEBUG_MODE'] == false) { + // only check for errors if the form is not in debug mode + + if (strlen($name) < 3) { + // name too short, add error + $errors['name_error'] = 'Your name is required'; + } + + if (strlen($email) == 0) { + // no email address given + $errors['email_error'] = 'Email address is required'; + } else if ( !preg_match('/^(?:[\w\d-]+\.?)+@(?:(?:[\w\d]\-?)+\.)+\w{2,63}$/i', $email)) { + // invalid email format + $errors['email_error'] = 'Email address entered is invalid'; + } + + if (strlen($message) < 20) { + // message length too short + $errors['message_error'] = 'Your message must be longer than 20 characters'; + } + } + + // Only try to validate the captcha if the form has no errors + // This is especially important for ajax calls + if (sizeof($errors) == 0) { + require_once dirname(__FILE__) . '/securimage.php'; + $securimage = new Securimage(); + + if ($securimage->check($captcha) == false) { + $errors['captcha_error'] = 'Incorrect security code entered
'; + } + } + + if (sizeof($errors) == 0) { + // no errors, send the form + $time = date('r'); + $message = "A message was submitted from the contact form. The following information was provided.

" + . "Name: $name
" + . "Email: $email
" + . "URL: $URL
" + . "Message:
" + . "
$message
" + . "

IP Address: {$_SERVER['REMOTE_ADDR']}
" + . "Time: $time
" + . "Browser: " . htmlspecialchars($_SERVER['HTTP_USER_AGENT']) . "
"; + + $message = wordwrap($message, 70); + + if (isset($GLOBALS['DEBUG_MODE']) && $GLOBALS['DEBUG_MODE'] == false) { + // send the message with mail() + mail($GLOBALS['ct_recipient'], $GLOBALS['ct_msg_subject'], $message, "From: {$GLOBALS['ct_recipient']}\r\nReply-To: {$email}\r\nContent-type: text/html; charset=UTF-8\r\nMIME-Version: 1.0"); + } + + $_SESSION['ctform']['timetosolve'] = $securimage->getTimeToSolve(); + $_SESSION['ctform']['error'] = false; // no error with form + $_SESSION['ctform']['success'] = true; // message sent + } else { + // save the entries, this is to re-populate the form + $_SESSION['ctform']['ct_name'] = $name; // save name from the form submission + $_SESSION['ctform']['ct_email'] = $email; // save email + $_SESSION['ctform']['ct_URL'] = $URL; // save URL + $_SESSION['ctform']['ct_message'] = $message; // save message + + foreach($errors as $key => $error) { + // set up error messages to display with each field + $_SESSION['ctform'][$key] = "$error"; + } + + $_SESSION['ctform']['error'] = true; // set error floag + } + } // POST +} + +$_SESSION['ctform']['success'] = false; // clear success value after running diff --git a/html/securimage/images/audio_icon.png b/html/securimage/images/audio_icon.png new file mode 100755 index 0000000..9922ef1 Binary files /dev/null and b/html/securimage/images/audio_icon.png differ diff --git a/html/securimage/images/loading.png b/html/securimage/images/loading.png new file mode 100755 index 0000000..1711568 Binary files /dev/null and b/html/securimage/images/loading.png differ diff --git a/html/securimage/images/refresh.png b/html/securimage/images/refresh.png new file mode 100755 index 0000000..f5e7d82 Binary files /dev/null and b/html/securimage/images/refresh.png differ diff --git a/html/securimage/securimage.css b/html/securimage/securimage.css new file mode 100755 index 0000000..0cffdb9 --- /dev/null +++ b/html/securimage/securimage.css @@ -0,0 +1,41 @@ +@CHARSET "UTF-8"; + +@-webkit-keyframes 
rotating /* Safari and Chrome */ { + from { + -ms-transform: rotate(0deg); + -moz-transform: rotate(0deg); + -webkit-transform: rotate(0deg); + -o-transform: rotate(0deg); + transform: rotate(0deg); + } + to { + -ms-transform: rotate(360deg); + -moz-transform: rotate(360deg); + -webkit-transform: rotate(360deg); + -o-transform: rotate(360deg); + transform: rotate(360deg); + } +} +@keyframes rotating { + from { + -ms-transform: rotate(0deg); + -moz-transform: rotate(0deg); + -webkit-transform: rotate(0deg); + -o-transform: rotate(0deg); + transform: rotate(0deg); + } + to { + -ms-transform: rotate(360deg); + -moz-transform: rotate(360deg); + -webkit-transform: rotate(360deg); + -o-transform: rotate(360deg); + transform: rotate(360deg); + } +} +.rotating { + -webkit-animation: rotating 1.5s linear infinite; + -moz-animation: rotating 1.5s linear infinite; + -ms-animation: rotating 1.5s linear infinite; + -o-animation: rotating 1.5s linear infinite; + animation: rotating 1.5s linear infinite; +} \ No newline at end of file diff --git a/html/securimage/securimage.js b/html/securimage/securimage.js new file mode 100755 index 0000000..481e9e6 --- /dev/null +++ b/html/securimage/securimage.js @@ -0,0 +1,252 @@ +/*! 
+ * Securimage CAPTCHA Audio Library + * https://www.phpcaptcha.org/ + * + * Copyright 2015 phpcaptcha.org + * Released under the BSD-3 license + * See https://github.com/dapphp/securimage/blob/master/README.md + */ + +var SecurimageAudio = function(options) { + this.html5Support = true; + this.flashFallback = false; + this.captchaId = null; + this.playing = false; + this.reload = false; + this.audioElement = null; + this.controlsElement = null; + this.playButton = null; + this.playButtonImage = null; + this.loadingImage = null; + + if (options.audioElement) { + this.audioElement = document.getElementById(options.audioElement); + } + if (options.controlsElement) { + this.controlsElement = document.getElementById(options.controlsElement); + } + + this.init(); +} + +SecurimageAudio.prototype.init = function() { + var ua = navigator.userAgent.toLowerCase(); + var ieVer = (ua.indexOf('msie') != -1) ? parseInt(ua.split('msie')[1]) : false; + // ie 11+ detection + if (!ieVer && null != (ieVer = ua.match(/trident\/.*rv:(\d+\.\d+)/))) + ieVer = parseInt(ieVer[1]); + + var objAu = this.audioElement.getElementsByTagName('object'); + if (objAu.length > 0) { + objAu = objAu[0]; + } else { + objAu = null; + } + + if (ieVer) { + if (ieVer < 9) { + // no html5 audio support, hide player controls + this.controlsElement.style.display = 'none'; + this.html5Support = false; + return ; + } else if ('' == this.audioElement.canPlayType('audio/wav')) { + // check for mpeg tag - if not found then fallback to flash + var sources = this.audioElement.getElementsByTagName('source'); + var mp3support = false; + var type; + + if (objAu) { + this.flashFallback = true; + } + + for (var i = 0; i < sources.length; ++i) { + type = sources[i].attributes["type"].value; + if (type.toLowerCase().indexOf('mpeg') >= 0 || type.toLowerCase().indexOf('mp3') >= 0) { + mp3support = true; + break; + } + } + + if (false == mp3support) { + // browser supports