# HG changeset patch # User xilinxu # Date 1408005891 14400 # Node ID abdbc8fe98ddfc656682ef0ca2fec2c1499fe2b9 # Parent ce5a8082bbb8c4f7711aed58c0a241e3d7dd560d Deleted selected files diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/COPYING --- a/bwa-0.7.9a/COPYING Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,674 +0,0 @@ - GNU GENERAL PUBLIC LICENSE - Version 3, 29 June 2007 - - Copyright (C) 2007 Free Software Foundation, Inc. - Everyone is permitted to copy and distribute verbatim copies - of this license document, but changing it is not allowed. - - Preamble - - The GNU General Public License is a free, copyleft license for -software and other kinds of works. - - The licenses for most software and other practical works are designed -to take away your freedom to share and change the works. By contrast, -the GNU General Public License is intended to guarantee your freedom to -share and change all versions of a program--to make sure it remains free -software for all its users. We, the Free Software Foundation, use the -GNU General Public License for most of our software; it applies also to -any other work released this way by its authors. You can apply it to -your programs, too. - - When we speak of free software, we are referring to freedom, not -price. Our General Public Licenses are designed to make sure that you -have the freedom to distribute copies of free software (and charge for -them if you wish), that you receive source code or can get it if you -want it, that you can change the software or use pieces of it in new -free programs, and that you know you can do these things. - - To protect your rights, we need to prevent others from denying you -these rights or asking you to surrender the rights. Therefore, you have -certain responsibilities if you distribute copies of the software, or if -you modify it: responsibilities to respect the freedom of others. 
- - For example, if you distribute copies of such a program, whether -gratis or for a fee, you must pass on to the recipients the same -freedoms that you received. You must make sure that they, too, receive -or can get the source code. And you must show them these terms so they -know their rights. - - Developers that use the GNU GPL protect your rights with two steps: -(1) assert copyright on the software, and (2) offer you this License -giving you legal permission to copy, distribute and/or modify it. - - For the developers' and authors' protection, the GPL clearly explains -that there is no warranty for this free software. For both users' and -authors' sake, the GPL requires that modified versions be marked as -changed, so that their problems will not be attributed erroneously to -authors of previous versions. - - Some devices are designed to deny users access to install or run -modified versions of the software inside them, although the manufacturer -can do so. This is fundamentally incompatible with the aim of -protecting users' freedom to change the software. The systematic -pattern of such abuse occurs in the area of products for individuals to -use, which is precisely where it is most unacceptable. Therefore, we -have designed this version of the GPL to prohibit the practice for those -products. If such problems arise substantially in other domains, we -stand ready to extend this provision to those domains in future versions -of the GPL, as needed to protect the freedom of users. - - Finally, every program is threatened constantly by software patents. -States should not allow patents to restrict development and use of -software on general-purpose computers, but in those that do, we wish to -avoid the special danger that patents applied to a free program could -make it effectively proprietary. To prevent this, the GPL assures that -patents cannot be used to render the program non-free. 
- - The precise terms and conditions for copying, distribution and -modification follow. - - TERMS AND CONDITIONS - - 0. Definitions. - - "This License" refers to version 3 of the GNU General Public License. - - "Copyright" also means copyright-like laws that apply to other kinds of -works, such as semiconductor masks. - - "The Program" refers to any copyrightable work licensed under this -License. Each licensee is addressed as "you". "Licensees" and -"recipients" may be individuals or organizations. - - To "modify" a work means to copy from or adapt all or part of the work -in a fashion requiring copyright permission, other than the making of an -exact copy. The resulting work is called a "modified version" of the -earlier work or a work "based on" the earlier work. - - A "covered work" means either the unmodified Program or a work based -on the Program. - - To "propagate" a work means to do anything with it that, without -permission, would make you directly or secondarily liable for -infringement under applicable copyright law, except executing it on a -computer or modifying a private copy. Propagation includes copying, -distribution (with or without modification), making available to the -public, and in some countries other activities as well. - - To "convey" a work means any kind of propagation that enables other -parties to make or receive copies. Mere interaction with a user through -a computer network, with no transfer of a copy, is not conveying. - - An interactive user interface displays "Appropriate Legal Notices" -to the extent that it includes a convenient and prominently visible -feature that (1) displays an appropriate copyright notice, and (2) -tells the user that there is no warranty for the work (except to the -extent that warranties are provided), that licensees may convey the -work under this License, and how to view a copy of this License. 
If -the interface presents a list of user commands or options, such as a -menu, a prominent item in the list meets this criterion. - - 1. Source Code. - - The "source code" for a work means the preferred form of the work -for making modifications to it. "Object code" means any non-source -form of a work. - - A "Standard Interface" means an interface that either is an official -standard defined by a recognized standards body, or, in the case of -interfaces specified for a particular programming language, one that -is widely used among developers working in that language. - - The "System Libraries" of an executable work include anything, other -than the work as a whole, that (a) is included in the normal form of -packaging a Major Component, but which is not part of that Major -Component, and (b) serves only to enable use of the work with that -Major Component, or to implement a Standard Interface for which an -implementation is available to the public in source code form. A -"Major Component", in this context, means a major essential component -(kernel, window system, and so on) of the specific operating system -(if any) on which the executable work runs, or a compiler used to -produce the work, or an object code interpreter used to run it. - - The "Corresponding Source" for a work in object code form means all -the source code needed to generate, install, and (for an executable -work) run the object code and to modify the work, including scripts to -control those activities. However, it does not include the work's -System Libraries, or general-purpose tools or generally available free -programs which are used unmodified in performing those activities but -which are not part of the work. 
For example, Corresponding Source -includes interface definition files associated with source files for -the work, and the source code for shared libraries and dynamically -linked subprograms that the work is specifically designed to require, -such as by intimate data communication or control flow between those -subprograms and other parts of the work. - - The Corresponding Source need not include anything that users -can regenerate automatically from other parts of the Corresponding -Source. - - The Corresponding Source for a work in source code form is that -same work. - - 2. Basic Permissions. - - All rights granted under this License are granted for the term of -copyright on the Program, and are irrevocable provided the stated -conditions are met. This License explicitly affirms your unlimited -permission to run the unmodified Program. The output from running a -covered work is covered by this License only if the output, given its -content, constitutes a covered work. This License acknowledges your -rights of fair use or other equivalent, as provided by copyright law. - - You may make, run and propagate covered works that you do not -convey, without conditions so long as your license otherwise remains -in force. You may convey covered works to others for the sole purpose -of having them make modifications exclusively for you, or provide you -with facilities for running those works, provided that you comply with -the terms of this License in conveying all material for which you do -not control copyright. Those thus making or running the covered works -for you must do so exclusively on your behalf, under your direction -and control, on terms that prohibit them from making any copies of -your copyrighted material outside their relationship with you. - - Conveying under any other circumstances is permitted solely under -the conditions stated below. Sublicensing is not allowed; section 10 -makes it unnecessary. - - 3. 
Protecting Users' Legal Rights From Anti-Circumvention Law. - - No covered work shall be deemed part of an effective technological -measure under any applicable law fulfilling obligations under article -11 of the WIPO copyright treaty adopted on 20 December 1996, or -similar laws prohibiting or restricting circumvention of such -measures. - - When you convey a covered work, you waive any legal power to forbid -circumvention of technological measures to the extent such circumvention -is effected by exercising rights under this License with respect to -the covered work, and you disclaim any intention to limit operation or -modification of the work as a means of enforcing, against the work's -users, your or third parties' legal rights to forbid circumvention of -technological measures. - - 4. Conveying Verbatim Copies. - - You may convey verbatim copies of the Program's source code as you -receive it, in any medium, provided that you conspicuously and -appropriately publish on each copy an appropriate copyright notice; -keep intact all notices stating that this License and any -non-permissive terms added in accord with section 7 apply to the code; -keep intact all notices of the absence of any warranty; and give all -recipients a copy of this License along with the Program. - - You may charge any price or no price for each copy that you convey, -and you may offer support or warranty protection for a fee. - - 5. Conveying Modified Source Versions. - - You may convey a work based on the Program, or the modifications to -produce it from the Program, in the form of source code under the -terms of section 4, provided that you also meet all of these conditions: - - a) The work must carry prominent notices stating that you modified - it, and giving a relevant date. - - b) The work must carry prominent notices stating that it is - released under this License and any conditions added under section - 7. 
This requirement modifies the requirement in section 4 to - "keep intact all notices". - - c) You must license the entire work, as a whole, under this - License to anyone who comes into possession of a copy. This - License will therefore apply, along with any applicable section 7 - additional terms, to the whole of the work, and all its parts, - regardless of how they are packaged. This License gives no - permission to license the work in any other way, but it does not - invalidate such permission if you have separately received it. - - d) If the work has interactive user interfaces, each must display - Appropriate Legal Notices; however, if the Program has interactive - interfaces that do not display Appropriate Legal Notices, your - work need not make them do so. - - A compilation of a covered work with other separate and independent -works, which are not by their nature extensions of the covered work, -and which are not combined with it such as to form a larger program, -in or on a volume of a storage or distribution medium, is called an -"aggregate" if the compilation and its resulting copyright are not -used to limit the access or legal rights of the compilation's users -beyond what the individual works permit. Inclusion of a covered work -in an aggregate does not cause this License to apply to the other -parts of the aggregate. - - 6. Conveying Non-Source Forms. - - You may convey a covered work in object code form under the terms -of sections 4 and 5, provided that you also convey the -machine-readable Corresponding Source under the terms of this License, -in one of these ways: - - a) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by the - Corresponding Source fixed on a durable physical medium - customarily used for software interchange. 
- - b) Convey the object code in, or embodied in, a physical product - (including a physical distribution medium), accompanied by a - written offer, valid for at least three years and valid for as - long as you offer spare parts or customer support for that product - model, to give anyone who possesses the object code either (1) a - copy of the Corresponding Source for all the software in the - product that is covered by this License, on a durable physical - medium customarily used for software interchange, for a price no - more than your reasonable cost of physically performing this - conveying of source, or (2) access to copy the - Corresponding Source from a network server at no charge. - - c) Convey individual copies of the object code with a copy of the - written offer to provide the Corresponding Source. This - alternative is allowed only occasionally and noncommercially, and - only if you received the object code with such an offer, in accord - with subsection 6b. - - d) Convey the object code by offering access from a designated - place (gratis or for a charge), and offer equivalent access to the - Corresponding Source in the same way through the same place at no - further charge. You need not require recipients to copy the - Corresponding Source along with the object code. If the place to - copy the object code is a network server, the Corresponding Source - may be on a different server (operated by you or a third party) - that supports equivalent copying facilities, provided you maintain - clear directions next to the object code saying where to find the - Corresponding Source. Regardless of what server hosts the - Corresponding Source, you remain obligated to ensure that it is - available for as long as needed to satisfy these requirements. 
- - e) Convey the object code using peer-to-peer transmission, provided - you inform other peers where the object code and Corresponding - Source of the work are being offered to the general public at no - charge under subsection 6d. - - A separable portion of the object code, whose source code is excluded -from the Corresponding Source as a System Library, need not be -included in conveying the object code work. - - A "User Product" is either (1) a "consumer product", which means any -tangible personal property which is normally used for personal, family, -or household purposes, or (2) anything designed or sold for incorporation -into a dwelling. In determining whether a product is a consumer product, -doubtful cases shall be resolved in favor of coverage. For a particular -product received by a particular user, "normally used" refers to a -typical or common use of that class of product, regardless of the status -of the particular user or of the way in which the particular user -actually uses, or expects or is expected to use, the product. A product -is a consumer product regardless of whether the product has substantial -commercial, industrial or non-consumer uses, unless such uses represent -the only significant mode of use of the product. - - "Installation Information" for a User Product means any methods, -procedures, authorization keys, or other information required to install -and execute modified versions of a covered work in that User Product from -a modified version of its Corresponding Source. The information must -suffice to ensure that the continued functioning of the modified object -code is in no case prevented or interfered with solely because -modification has been made. 
- - If you convey an object code work under this section in, or with, or -specifically for use in, a User Product, and the conveying occurs as -part of a transaction in which the right of possession and use of the -User Product is transferred to the recipient in perpetuity or for a -fixed term (regardless of how the transaction is characterized), the -Corresponding Source conveyed under this section must be accompanied -by the Installation Information. But this requirement does not apply -if neither you nor any third party retains the ability to install -modified object code on the User Product (for example, the work has -been installed in ROM). - - The requirement to provide Installation Information does not include a -requirement to continue to provide support service, warranty, or updates -for a work that has been modified or installed by the recipient, or for -the User Product in which it has been modified or installed. Access to a -network may be denied when the modification itself materially and -adversely affects the operation of the network or violates the rules and -protocols for communication across the network. - - Corresponding Source conveyed, and Installation Information provided, -in accord with this section must be in a format that is publicly -documented (and with an implementation available to the public in -source code form), and must require no special password or key for -unpacking, reading or copying. - - 7. Additional Terms. - - "Additional permissions" are terms that supplement the terms of this -License by making exceptions from one or more of its conditions. -Additional permissions that are applicable to the entire Program shall -be treated as though they were included in this License, to the extent -that they are valid under applicable law. 
If additional permissions -apply only to part of the Program, that part may be used separately -under those permissions, but the entire Program remains governed by -this License without regard to the additional permissions. - - When you convey a copy of a covered work, you may at your option -remove any additional permissions from that copy, or from any part of -it. (Additional permissions may be written to require their own -removal in certain cases when you modify the work.) You may place -additional permissions on material, added by you to a covered work, -for which you have or can give appropriate copyright permission. - - Notwithstanding any other provision of this License, for material you -add to a covered work, you may (if authorized by the copyright holders of -that material) supplement the terms of this License with terms: - - a) Disclaiming warranty or limiting liability differently from the - terms of sections 15 and 16 of this License; or - - b) Requiring preservation of specified reasonable legal notices or - author attributions in that material or in the Appropriate Legal - Notices displayed by works containing it; or - - c) Prohibiting misrepresentation of the origin of that material, or - requiring that modified versions of such material be marked in - reasonable ways as different from the original version; or - - d) Limiting the use for publicity purposes of names of licensors or - authors of the material; or - - e) Declining to grant rights under trademark law for use of some - trade names, trademarks, or service marks; or - - f) Requiring indemnification of licensors and authors of that - material by anyone who conveys the material (or modified versions of - it) with contractual assumptions of liability to the recipient, for - any liability that these contractual assumptions directly impose on - those licensors and authors. - - All other non-permissive additional terms are considered "further -restrictions" within the meaning of section 10. 
If the Program as you -received it, or any part of it, contains a notice stating that it is -governed by this License along with a term that is a further -restriction, you may remove that term. If a license document contains -a further restriction but permits relicensing or conveying under this -License, you may add to a covered work material governed by the terms -of that license document, provided that the further restriction does -not survive such relicensing or conveying. - - If you add terms to a covered work in accord with this section, you -must place, in the relevant source files, a statement of the -additional terms that apply to those files, or a notice indicating -where to find the applicable terms. - - Additional terms, permissive or non-permissive, may be stated in the -form of a separately written license, or stated as exceptions; -the above requirements apply either way. - - 8. Termination. - - You may not propagate or modify a covered work except as expressly -provided under this License. Any attempt otherwise to propagate or -modify it is void, and will automatically terminate your rights under -this License (including any patent licenses granted under the third -paragraph of section 11). - - However, if you cease all violation of this License, then your -license from a particular copyright holder is reinstated (a) -provisionally, unless and until the copyright holder explicitly and -finally terminates your license, and (b) permanently, if the copyright -holder fails to notify you of the violation by some reasonable means -prior to 60 days after the cessation. - - Moreover, your license from a particular copyright holder is -reinstated permanently if the copyright holder notifies you of the -violation by some reasonable means, this is the first time you have -received notice of violation of this License (for any work) from that -copyright holder, and you cure the violation prior to 30 days after -your receipt of the notice. 
- - Termination of your rights under this section does not terminate the -licenses of parties who have received copies or rights from you under -this License. If your rights have been terminated and not permanently -reinstated, you do not qualify to receive new licenses for the same -material under section 10. - - 9. Acceptance Not Required for Having Copies. - - You are not required to accept this License in order to receive or -run a copy of the Program. Ancillary propagation of a covered work -occurring solely as a consequence of using peer-to-peer transmission -to receive a copy likewise does not require acceptance. However, -nothing other than this License grants you permission to propagate or -modify any covered work. These actions infringe copyright if you do -not accept this License. Therefore, by modifying or propagating a -covered work, you indicate your acceptance of this License to do so. - - 10. Automatic Licensing of Downstream Recipients. - - Each time you convey a covered work, the recipient automatically -receives a license from the original licensors, to run, modify and -propagate that work, subject to this License. You are not responsible -for enforcing compliance by third parties with this License. - - An "entity transaction" is a transaction transferring control of an -organization, or substantially all assets of one, or subdividing an -organization, or merging organizations. If propagation of a covered -work results from an entity transaction, each party to that -transaction who receives a copy of the work also receives whatever -licenses to the work the party's predecessor in interest had or could -give under the previous paragraph, plus a right to possession of the -Corresponding Source of the work from the predecessor in interest, if -the predecessor has it or can get it with reasonable efforts. - - You may not impose any further restrictions on the exercise of the -rights granted or affirmed under this License. 
For example, you may -not impose a license fee, royalty, or other charge for exercise of -rights granted under this License, and you may not initiate litigation -(including a cross-claim or counterclaim in a lawsuit) alleging that -any patent claim is infringed by making, using, selling, offering for -sale, or importing the Program or any portion of it. - - 11. Patents. - - A "contributor" is a copyright holder who authorizes use under this -License of the Program or a work on which the Program is based. The -work thus licensed is called the contributor's "contributor version". - - A contributor's "essential patent claims" are all patent claims -owned or controlled by the contributor, whether already acquired or -hereafter acquired, that would be infringed by some manner, permitted -by this License, of making, using, or selling its contributor version, -but do not include claims that would be infringed only as a -consequence of further modification of the contributor version. For -purposes of this definition, "control" includes the right to grant -patent sublicenses in a manner consistent with the requirements of -this License. - - Each contributor grants you a non-exclusive, worldwide, royalty-free -patent license under the contributor's essential patent claims, to -make, use, sell, offer for sale, import and otherwise run, modify and -propagate the contents of its contributor version. - - In the following three paragraphs, a "patent license" is any express -agreement or commitment, however denominated, not to enforce a patent -(such as an express permission to practice a patent or covenant not to -sue for patent infringement). To "grant" such a patent license to a -party means to make such an agreement or commitment not to enforce a -patent against the party. 
- - If you convey a covered work, knowingly relying on a patent license, -and the Corresponding Source of the work is not available for anyone -to copy, free of charge and under the terms of this License, through a -publicly available network server or other readily accessible means, -then you must either (1) cause the Corresponding Source to be so -available, or (2) arrange to deprive yourself of the benefit of the -patent license for this particular work, or (3) arrange, in a manner -consistent with the requirements of this License, to extend the patent -license to downstream recipients. "Knowingly relying" means you have -actual knowledge that, but for the patent license, your conveying the -covered work in a country, or your recipient's use of the covered work -in a country, would infringe one or more identifiable patents in that -country that you have reason to believe are valid. - - If, pursuant to or in connection with a single transaction or -arrangement, you convey, or propagate by procuring conveyance of, a -covered work, and grant a patent license to some of the parties -receiving the covered work authorizing them to use, propagate, modify -or convey a specific copy of the covered work, then the patent license -you grant is automatically extended to all recipients of the covered -work and works based on it. - - A patent license is "discriminatory" if it does not include within -the scope of its coverage, prohibits the exercise of, or is -conditioned on the non-exercise of one or more of the rights that are -specifically granted under this License. 
You may not convey a covered -work if you are a party to an arrangement with a third party that is -in the business of distributing software, under which you make payment -to the third party based on the extent of your activity of conveying -the work, and under which the third party grants, to any of the -parties who would receive the covered work from you, a discriminatory -patent license (a) in connection with copies of the covered work -conveyed by you (or copies made from those copies), or (b) primarily -for and in connection with specific products or compilations that -contain the covered work, unless you entered into that arrangement, -or that patent license was granted, prior to 28 March 2007. - - Nothing in this License shall be construed as excluding or limiting -any implied license or other defenses to infringement that may -otherwise be available to you under applicable patent law. - - 12. No Surrender of Others' Freedom. - - If conditions are imposed on you (whether by court order, agreement or -otherwise) that contradict the conditions of this License, they do not -excuse you from the conditions of this License. If you cannot convey a -covered work so as to satisfy simultaneously your obligations under this -License and any other pertinent obligations, then as a consequence you may -not convey it at all. For example, if you agree to terms that obligate you -to collect a royalty for further conveying from those to whom you convey -the Program, the only way you could satisfy both those terms and this -License would be to refrain entirely from conveying the Program. - - 13. Use with the GNU Affero General Public License. - - Notwithstanding any other provision of this License, you have -permission to link or combine any covered work with a work licensed -under version 3 of the GNU Affero General Public License into a single -combined work, and to convey the resulting work. 
The terms of this -License will continue to apply to the part which is the covered work, -but the special requirements of the GNU Affero General Public License, -section 13, concerning interaction through a network will apply to the -combination as such. - - 14. Revised Versions of this License. - - The Free Software Foundation may publish revised and/or new versions of -the GNU General Public License from time to time. Such new versions will -be similar in spirit to the present version, but may differ in detail to -address new problems or concerns. - - Each version is given a distinguishing version number. If the -Program specifies that a certain numbered version of the GNU General -Public License "or any later version" applies to it, you have the -option of following the terms and conditions either of that numbered -version or of any later version published by the Free Software -Foundation. If the Program does not specify a version number of the -GNU General Public License, you may choose any version ever published -by the Free Software Foundation. - - If the Program specifies that a proxy can decide which future -versions of the GNU General Public License can be used, that proxy's -public statement of acceptance of a version permanently authorizes you -to choose that version for the Program. - - Later license versions may give you additional or different -permissions. However, no additional obligations are imposed on any -author or copyright holder as a result of your choosing to follow a -later version. - - 15. Disclaimer of Warranty. - - THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY -APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT -HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY -OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, -THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM -IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF -ALL NECESSARY SERVICING, REPAIR OR CORRECTION. - - 16. Limitation of Liability. - - IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING -WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS -THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY -GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE -USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF -DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD -PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), -EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF -SUCH DAMAGES. - - 17. Interpretation of Sections 15 and 16. - - If the disclaimer of warranty and limitation of liability provided -above cannot be given local legal effect according to their terms, -reviewing courts shall apply local law that most closely approximates -an absolute waiver of all civil liability in connection with the -Program, unless a warranty or assumption of liability accompanies a -copy of the Program in return for a fee. - - END OF TERMS AND CONDITIONS - - How to Apply These Terms to Your New Programs - - If you develop a new program, and you want it to be of the greatest -possible use to the public, the best way to achieve this is to make it -free software which everyone can redistribute and change under these terms. - - To do so, attach the following notices to the program. It is safest -to attach them to the start of each source file to most effectively -state the exclusion of warranty; and each file should have at least -the "copyright" line and a pointer to where the full notice is found. 
- - <one line to give the program's name and a brief idea of what it does.> - Copyright (C) <year> <name of author> - - This program is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 3 of the License, or - (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program. If not, see <http://www.gnu.org/licenses/>. - -Also add information on how to contact you by electronic and paper mail. - - If the program does terminal interaction, make it output a short -notice like this when it starts in an interactive mode: - - <program> Copyright (C) <year> <name of author> - This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. - This is free software, and you are welcome to redistribute it - under certain conditions; type `show c' for details. - -The hypothetical commands `show w' and `show c' should show the appropriate -parts of the General Public License. Of course, your program's commands -might be different; for a GUI interface, you would use an "about box". - - You should also get your employer (if you work as a programmer) or school, -if any, to sign a "copyright disclaimer" for the program, if necessary. -For more information on this, and how to apply and follow the GNU GPL, see -<http://www.gnu.org/licenses/>. - - The GNU General Public License does not permit incorporating your program -into proprietary programs. If your program is a subroutine library, you -may consider it more useful to permit linking proprietary applications with -the library. If this is what you want to do, use the GNU Lesser General -Public License instead of this License. But first, please read -<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/ChangeLog --- a/bwa-0.7.9a/ChangeLog Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,3864 +0,0 @@ ------------------------------------------------------------------------- -r1605 | lh3 | 2010-12-29 20:20:20 -0500 (Wed, 29 Dec 2010) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/main.c - - * bwa-0.5.9rc1-2 (r1605) - * fixed a typo/bug in bwasw - ------------------------------------------------------------------------- -r1587 | lh3 | 2010-12-21 18:48:30 -0500 (Tue, 21 Dec 2010) | 2 lines -Changed paths: - M /branches/prog/bwa/bwa.1 - -a typo in the manual - ------------------------------------------------------------------------- -r1586 | lh3 | 2010-12-21 18:47:48 -0500 (Tue, 21 Dec 2010) | 3 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtsw2_main.c - M /branches/prog/bwa/main.c - M /branches/prog/bwa/utils.c - M /branches/prog/bwa/utils.h - - * bwa-0.5.9rc1-1 (r1586) - * a few patches by John - ------------------------------------------------------------------------- -r1562 | lh3 | 2010-12-10 01:02:06 -0500 (Fri, 10 Dec 2010) | 2 lines -Changed paths: - M /branches/prog/bwa/bwa.1 - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwase.c - -documentation on specifying @RG - ------------------------------------------------------------------------- -r1561 | lh3 | 2010-12-10 00:45:40 -0500 (Fri, 10 Dec 2010) | 2 lines -Changed paths: - M /branches/prog/bwa/ChangeLog - M /branches/prog/bwa/NEWS - M /branches/prog/bwa/bwa.1 - M /branches/prog/bwa/main.c - -Release bwa-0.5.9rc1 (r1561) - ------------------------------------------------------------------------- -r1560 | lh3 | 2010-12-10 00:29:08 -0500 (Fri, 10 Dec 2010) | 3 lines -Changed paths: - M /branches/prog/bwa/bwaseqio.c - M /branches/prog/bwa/main.c - - * fixed a small memory leak caused by the BAM 
reader - * fixed a memory violation, also in the BAM reader - ------------------------------------------------------------------------- -r1559 | lh3 | 2010-12-10 00:10:48 -0500 (Fri, 10 Dec 2010) | 2 lines -Changed paths: - M /branches/prog/bwa/ChangeLog - M /branches/prog/bwa/Makefile - -change Makefile gcc options - ------------------------------------------------------------------------- -r1558 | lh3 | 2010-12-10 00:09:22 -0500 (Fri, 10 Dec 2010) | 4 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/main.c - - * bwa-0.5.8-6 (r1557) - * added a little more comments to BWA-SW - * randomly choosing a mapping if there are more than one - ------------------------------------------------------------------------- -r1557 | lh3 | 2010-12-09 21:58:00 -0500 (Thu, 09 Dec 2010) | 2 lines -Changed paths: - M /branches/prog/bwa/Makefile - M /branches/prog/bwa/bwtsw2_aux.c - -sometimes unmapped reads may not be printed... - ------------------------------------------------------------------------- -r1556 | lh3 | 2010-12-09 21:50:26 -0500 (Thu, 09 Dec 2010) | 2 lines -Changed paths: - M /branches/prog/bwa/Makefile - M /branches/prog/bwa/bwtsw2_aux.c - -print unmapped reads - ------------------------------------------------------------------------- -r1555 | lh3 | 2010-12-09 21:17:20 -0500 (Thu, 09 Dec 2010) | 3 lines -Changed paths: - M /branches/prog/bwa/ChangeLog - M /branches/prog/bwa/bwa.1 - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/main.c - - * bwa-0.5.8-5 (r1555) - * BAM input documentation - ------------------------------------------------------------------------- -r1544 | lh3 | 2010-11-23 11:01:41 -0500 (Tue, 23 Nov 2010) | 3 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/main.c - - * bwa-0.5.8-4 (r1544) - * supporting adding RG tags and RG lines - ------------------------------------------------------------------------- -r1543 | lh3 | 
2010-11-23 00:16:40 -0500 (Tue, 23 Nov 2010) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/main.c - - * bwa-0.5.8-3 (r1543) - * fixed a memory leak - ------------------------------------------------------------------------- -r1542 | lh3 | 2010-11-22 23:50:56 -0500 (Mon, 22 Nov 2010) | 3 lines -Changed paths: - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/main.c - - * bwa-0.5.8-2 (r1542) - * fixed a long existing bug in random placement of reads - ------------------------------------------------------------------------- -r1541 | lh3 | 2010-11-22 23:27:29 -0500 (Mon, 22 Nov 2010) | 2 lines -Changed paths: - M /branches/prog/bwa/Makefile - A /branches/prog/bwa/bamlite.c - A /branches/prog/bwa/bamlite.h - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/bwaseqio.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/main.c - -preliminary BAM input support - ------------------------------------------------------------------------- -r1537 | lh3 | 2010-10-16 23:46:20 -0400 (Sat, 16 Oct 2010) | 2 lines -Changed paths: - M /branches/prog/bwa/ChangeLog - M /branches/prog/bwa/bwa.1 - -change version number and ChangeLog - ------------------------------------------------------------------------- -r1536 | lh3 | 2010-10-16 23:35:10 -0400 (Sat, 16 Oct 2010) | 3 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/main.c - M /branches/prog/bwa/stdaln.c - - * fixed a bug in the scoring matrix - * release bwa-0.5.8c (r1536) - ------------------------------------------------------------------------- -r1451 | lh3 | 2010-06-15 09:43:52 -0400 (Tue, 15 Jun 2010) | 2 lines -Changed paths: - M /branches/prog/bwa/bwa.1 - -version change - ------------------------------------------------------------------------- -r1450 | lh3 | 2010-06-15 09:42:21 -0400 (Tue, 15 Jun 2010) | 3 lines -Changed paths: - M /branches/prog/bwa/main.c - M /branches/prog/bwa/stdaln.c - - 
* bwa-0.5.8b (r1450) - * fixed a bug in scoring matrix - ------------------------------------------------------------------------- -r1445 | lh3 | 2010-06-11 08:58:33 -0400 (Fri, 11 Jun 2010) | 2 lines -Changed paths: - M /branches/prog/bwa/bwa.1 - M /branches/prog/bwa/bwape.c - -fixed a serious bug - ------------------------------------------------------------------------- -r1442 | lh3 | 2010-06-08 10:22:14 -0400 (Tue, 08 Jun 2010) | 2 lines -Changed paths: - M /branches/prog/bwa/ChangeLog - M /branches/prog/bwa/NEWS - M /branches/prog/bwa/main.c - -Release bwa-0.5.8 (r1442) - ------------------------------------------------------------------------- -r1440 | lh3 | 2010-05-19 13:43:50 -0400 (Wed, 19 May 2010) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/main.c - - * bwa-r1440 - * sorry, forget to remove a debugging line - ------------------------------------------------------------------------- -r1439 | lh3 | 2010-05-19 13:43:08 -0400 (Wed, 19 May 2010) | 4 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/main.c - - * bwa-r1439 - * fixed a bug in bwasw caused by a recent modification - * throwing insane insert size when estimating isize - ------------------------------------------------------------------------- -r1425 | lh3 | 2010-04-29 15:15:23 -0400 (Thu, 29 Apr 2010) | 10 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/bwtsw2_main.c - M /branches/prog/bwa/main.c - - * bwa-0.5.7-7 (r1425) - * fixed a minor bug in bwasw command-line parsing - * When band-width is not large enough, bwasw may find two highly - overlapping but not completely overlapping alignments. The old - version will filter out one of them, which leads to false - negatives. The current outputs both. This solution is obviously not - ideal. The ideal one would be to increase the band-width and redo the - alignment. 
- - ------------------------------------------------------------------------- -r1399 | lh3 | 2010-04-16 09:20:49 -0400 (Fri, 16 Apr 2010) | 3 lines -Changed paths: - M /branches/prog/bwa/ChangeLog - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/main.c - - * bwa-0.5.7-6 (r1399) - * fixed a typo/bug (by Vaughn Iverson) - ------------------------------------------------------------------------- -r1329 | lh3 | 2010-03-19 23:32:46 -0400 (Fri, 19 Mar 2010) | 2 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/main.c - -small correction - ------------------------------------------------------------------------- -r1328 | lh3 | 2010-03-19 23:28:44 -0400 (Fri, 19 Mar 2010) | 3 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/main.c - - * bwa-0.5.7-4 (r1328) - * automatically adjust ap_prior based on alignment - ------------------------------------------------------------------------- -r1327 | lh3 | 2010-03-19 23:02:40 -0400 (Fri, 19 Mar 2010) | 3 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/main.c - M /branches/prog/bwa/stdaln.c - M /branches/prog/bwa/stdaln.h - - * bwa-0.5.7-3 (r1327) - * evaluate hits obtained from SW alignment in a more proper way. - ------------------------------------------------------------------------- -r1320 | lh3 | 2010-03-17 15:13:22 -0400 (Wed, 17 Mar 2010) | 2 lines -Changed paths: - M /branches/prog/bwa/bwape.c - -fixed a potential out-of-boundary error. Need more testing. 
- ------------------------------------------------------------------------- -r1319 | lh3 | 2010-03-14 22:44:46 -0400 (Sun, 14 Mar 2010) | 2 lines -Changed paths: - M /branches/prog/bwa/bwape.c - -insert size is `weird' if the 3rd quatile larger than 100,000bp - ------------------------------------------------------------------------- -r1318 | lh3 | 2010-03-14 22:37:35 -0400 (Sun, 14 Mar 2010) | 3 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/main.c - - * bwa-0.5.7-2 (r1318) - * in sampe, allow to disable insert size estimate - ------------------------------------------------------------------------- -r1317 | lh3 | 2010-03-14 22:14:14 -0400 (Sun, 14 Mar 2010) | 5 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/main.c - M /branches/prog/bwa/solid2fastq.pl - - * bwa-0.5.7-1 (r1317) - * fixed a potential bug in solid2fastq.pl - * fixed a bug in calculating mapping quality (by Rodrigo Goya) - * fixed a very rare bug (if ever occur) about pairing - ------------------------------------------------------------------------- -r1310 | lh3 | 2010-03-01 10:35:45 -0500 (Mon, 01 Mar 2010) | 2 lines -Changed paths: - M /branches/prog/bwa/ChangeLog - M /branches/prog/bwa/NEWS - M /branches/prog/bwa/main.c - -Release bwa-0.5.7 - ------------------------------------------------------------------------- -r1309 | lh3 | 2010-02-26 21:42:22 -0500 (Fri, 26 Feb 2010) | 4 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/main.c - - * bwa-0.5.6-2 (r1309) - * fixed an unfixed bug (by Carol Scott) - * fixed some tiny formatting - ------------------------------------------------------------------------- -r1305 | lh3 | 2010-02-25 13:47:58 -0500 (Thu, 25 Feb 2010) | 3 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtsw2_main.c - 
M /branches/prog/bwa/main.c - - * bwa-0.5.6-1 (r1304) - * optionally write output to a file (by Tim Fennel) - ------------------------------------------------------------------------- -r1303 | lh3 | 2010-02-10 23:43:48 -0500 (Wed, 10 Feb 2010) | 2 lines -Changed paths: - M /branches/prog/bwa/ChangeLog - M /branches/prog/bwa/NEWS - M /branches/prog/bwa/bwa.1 - M /branches/prog/bwa/bwtsw2_main.c - M /branches/prog/bwa/main.c - -Release bwa-0.5.6 - ------------------------------------------------------------------------- -r1302 | lh3 | 2010-02-10 11:11:49 -0500 (Wed, 10 Feb 2010) | 3 lines -Changed paths: - M /branches/prog/bwa/ChangeLog - M /branches/prog/bwa/NEWS - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/main.c - - * bwa-0.5.5-10 (r1302) - * improve max insert size estimate (method suggested by Gerton Lunter) - ------------------------------------------------------------------------- -r1301 | lh3 | 2010-02-09 16:15:28 -0500 (Tue, 09 Feb 2010) | 5 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/main.c - - * bwa-0.5.5-9 (r1301) - * improve mapping quality calculation for abnomalous pairs - * fixed a bug in multiple hits - * SOLiD multiple hits should work now - ------------------------------------------------------------------------- -r1300 | lh3 | 2010-02-09 12:50:02 -0500 (Tue, 09 Feb 2010) | 3 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/main.c - - * bwa-0.5.5-8 (r1300) - * output kurtosis - ------------------------------------------------------------------------- -r1299 | lh3 | 2010-02-09 12:33:34 -0500 (Tue, 09 Feb 2010) | 5 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/main.c - - * bwa-0.5.5-7 (r1299) - * calculate skewness in sampe - * increase min_len in SW to 20 - * perform more SW to fix discordant pairs - ------------------------------------------------------------------------- -r1298 | lh3 | 
2010-02-08 12:40:31 -0500 (Mon, 08 Feb 2010) | 3 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/cs2nt.c - M /branches/prog/bwa/main.c - M /branches/prog/bwa/stdaln.h - - * bwa-0.5.5-6 (r1297) - * prepare to replace all 16-bit CIGAR (patches by Rodrigo Goya) - ------------------------------------------------------------------------- -r1297 | lh3 | 2010-02-05 22:26:11 -0500 (Fri, 05 Feb 2010) | 2 lines -Changed paths: - M /branches/prog/bwa/solid2fastq.pl - -the old fix seems not working! - ------------------------------------------------------------------------- -r1296 | lh3 | 2010-02-05 21:51:03 -0500 (Fri, 05 Feb 2010) | 3 lines -Changed paths: - M /branches/prog/bwa/bwa.1 - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/main.c - - * bwa-0.5.5-5 (r1296) - * fixed a minor issue that the lower bound of insert size is not correctly set. - ------------------------------------------------------------------------- -r1295 | lh3 | 2010-02-05 21:01:10 -0500 (Fri, 05 Feb 2010) | 5 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/bwaseqio.c - M /branches/prog/bwa/main.c - - * bwa-0.5.5-4 (r1295) - * fixed a memory leak - * change the behaviour of -n (samse and sampe) - * change the default of -n - ------------------------------------------------------------------------- -r1294 | lh3 | 2010-02-05 17:24:06 -0500 (Fri, 05 Feb 2010) | 3 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/bwaseqio.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/main.c - - * bwa-0.5.5-3 (r1294) - * improved multi-hit report - ------------------------------------------------------------------------- -r1293 | lh3 | 2010-02-05 12:57:38 -0500 (Fri, 05 Feb 2010) | 5 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwase.c - M 
/branches/prog/bwa/cs2nt.c - M /branches/prog/bwa/main.c - M /branches/prog/bwa/solid2fastq.pl - - * bwa-0.5.5-2 (r1293) - * bugfix: truncated quality string - * bugfix: quality -1 in solid->fastq conversion - * bugfix: color reads on the reverse strand is not complemented - ------------------------------------------------------------------------- -r1279 | lh3 | 2009-11-23 22:42:34 -0500 (Mon, 23 Nov 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bntseq.c - M /branches/prog/bwa/bntseq.h - M /branches/prog/bwa/bwase.c - A /branches/prog/bwa/bwase.h - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/main.c - - * bwa-0.5.5-1 (r1279) - * incorporate changes from Matt Hanna for Java bindings. - ------------------------------------------------------------------------- -r1275 | lh3 | 2009-11-10 22:13:10 -0500 (Tue, 10 Nov 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/ChangeLog - -update ChangeLog - ------------------------------------------------------------------------- -r1273 | lh3 | 2009-11-10 22:08:16 -0500 (Tue, 10 Nov 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/NEWS - M /branches/prog/bwa/bwa.1 - M /branches/prog/bwa/main.c - A /branches/prog/bwa/qualfa2fq.pl - -Release bwa-0.5.5 (r1273) - ------------------------------------------------------------------------- -r1272 | lh3 | 2009-11-10 22:02:50 -0500 (Tue, 10 Nov 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/main.c - - * bwa-0.5.4-3 (r1272) - * fixed another typo which may lead to incorrect single-end mapping quality - ------------------------------------------------------------------------- -r1271 | lh3 | 2009-11-10 21:59:47 -0500 (Tue, 10 Nov 2009) | 4 lines -Changed paths: - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/main.c - - * bwa-0.5.4-2 (r1271) - * fixed a serious typo/bug which does not hurt if we allow one gap open - and work with <200bp reads, but causes 
segfault for long reads. - ------------------------------------------------------------------------- -r1270 | lh3 | 2009-11-09 23:12:42 -0500 (Mon, 09 Nov 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/cs2nt.c - M /branches/prog/bwa/main.c - - * bwa-0.5.4-1 (r1270) - * fixed a bug in color alignment - ------------------------------------------------------------------------- -r1245 | lh3 | 2009-10-09 07:42:52 -0400 (Fri, 09 Oct 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/ChangeLog - M /branches/prog/bwa/NEWS - M /branches/prog/bwa/bwa.1 - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/bwaseqio.c - M /branches/prog/bwa/main.c - -Release bwa-0.5.4 - ------------------------------------------------------------------------- -r1244 | lh3 | 2009-10-09 05:53:52 -0400 (Fri, 09 Oct 2009) | 5 lines -Changed paths: - M /branches/prog/bwa/ChangeLog - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/bwaseqio.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/main.c - M /branches/prog/bwa/stdaln.c - - * bwa-0.5.3-4 (r1244) - * output the clipped length in XC:i: tag - * skip mate alignment when stdaln is buggy - * fixed a bug in NM:i: tag - ------------------------------------------------------------------------- -r1243 | lh3 | 2009-10-07 08:15:04 -0400 (Wed, 07 Oct 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/main.c - - * bwa-0.5.3-3 (r1243) - * sampe: fixed a bug when a read sequence is identical its reverse complement. 
- ------------------------------------------------------------------------- -r1242 | lh3 | 2009-10-07 07:49:13 -0400 (Wed, 07 Oct 2009) | 4 lines -Changed paths: - M /branches/prog/bwa/bntseq.c - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/main.c - - * bwa-0.5.3-2 (r1242) - * sampe: optionall preload the full index into memory - * aln: change the default seed length to 32bp - ------------------------------------------------------------------------- -r1238 | lh3 | 2009-09-26 18:38:15 -0400 (Sat, 26 Sep 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/khash.h - -Improve portability of khash.h - ------------------------------------------------------------------------- -r1228 | lh3 | 2009-09-15 09:20:22 -0400 (Tue, 15 Sep 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/main.c - -fixed a typo - ------------------------------------------------------------------------- -r1227 | lh3 | 2009-09-15 09:19:35 -0400 (Tue, 15 Sep 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtsw2.h - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_main.c - M /branches/prog/bwa/main.c - - * bwa-0.5.3-1 (r1226) - * in dBWT-SW, optionall use hard clipping instead of soft clipping - ------------------------------------------------------------------------- -r1225 | lh3 | 2009-09-15 08:32:30 -0400 (Tue, 15 Sep 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/NEWS - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/main.c - -Release bwa-0.5.3 (r1225) - ------------------------------------------------------------------------- -r1223 | lh3 | 2009-09-13 07:30:41 -0400 (Sun, 13 Sep 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/ChangeLog - M /branches/prog/bwa/NEWS - M /branches/prog/bwa/bwa.1 - M /branches/prog/bwa/main.c - -Release bwa-0.5.2 - ------------------------------------------------------------------------- -r1222 | lh3 | 2009-09-11 09:11:39 -0400 (Fri, 11 Sep 2009) | 3 
lines -Changed paths: - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/main.c - - * bwa-0.5.1-5 (r1222) - * fixed a typo. No real change - ------------------------------------------------------------------------- -r1221 | lh3 | 2009-09-11 09:09:44 -0400 (Fri, 11 Sep 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwa.1 - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/bwaseqio.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/main.c - - * bwa-0.5.1-4 (r1221) - * trim reads before alignment - ------------------------------------------------------------------------- -r1216 | lh3 | 2009-09-08 17:50:15 -0400 (Tue, 08 Sep 2009) | 4 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/main.c - - * bwa-0.5.1-3 (r1216) - * fixed a bug about NM tags for gapped alignment - * print SAM header - ------------------------------------------------------------------------- -r1215 | lh3 | 2009-09-08 17:14:42 -0400 (Tue, 08 Sep 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/main.c - - * bwa-0.5.1-2 (r1215) - * fixed a bug when read lengths vary (by John Marshall) - ------------------------------------------------------------------------- -r1213 | lh3 | 2009-09-06 18:58:15 -0400 (Sun, 06 Sep 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/main.c - - * bwa-0.5.1-1 (r1213) - * change default -T to 30 - ------------------------------------------------------------------------- -r1209 | lh3 | 2009-09-02 06:06:02 -0400 (Wed, 02 Sep 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/NEWS - M /branches/prog/bwa/bwa.1 - M /branches/prog/bwa/main.c - -Release bwa-0.5.1 - ------------------------------------------------------------------------- -r1208 | lh3 | 2009-09-02 05:56:33 -0400 (Wed, 02 Sep 2009) | 2 lines -Changed paths: - M 
/branches/prog/bwa/ChangeLog - - * ChangeLog - ------------------------------------------------------------------------- -r1206 | lh3 | 2009-08-30 18:27:30 -0400 (Sun, 30 Aug 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/main.c - - * bwa-0.5.0-6 (r1206) - * fixed two bugs caused by previous modification - ------------------------------------------------------------------------- -r1205 | lh3 | 2009-08-30 17:28:36 -0400 (Sun, 30 Aug 2009) | 4 lines -Changed paths: - M /branches/prog/bwa/ChangeLog - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/main.c - - * bwa-0.5.0-4 (r1205) - * reduce false coordinates and CIGAR when a query bridges two reference - sequences, although some very rare cases may fail bwa. - ------------------------------------------------------------------------- -r1204 | lh3 | 2009-08-30 06:06:16 -0400 (Sun, 30 Aug 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/main.c - - * bwa-0.5.0-3 (r1204) - * choose one repetitive hit to extend - ------------------------------------------------------------------------- -r1203 | lh3 | 2009-08-29 18:11:51 -0400 (Sat, 29 Aug 2009) | 4 lines -Changed paths: - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/main.c - - * bwa-0.5.0-2 (r1203) - * dBWT-SW: change a parameter in calculating mapping quality - * fixed a bug in samse - ------------------------------------------------------------------------- -r1202 | lh3 | 2009-08-28 19:48:41 -0400 (Fri, 28 Aug 2009) | 4 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/main.c - - * bwa-0.5.0-1 (r1202) - * change default band width to 50 - * improve mapping quality a bit - ------------------------------------------------------------------------- -r1200 | lh3 | 2009-08-20 06:21:24 -0400 (Thu, 20 Aug 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/NEWS - M 
/branches/prog/bwa/main.c - -Release bwa-0.5.0 (r1200) - ------------------------------------------------------------------------- -r1199 | lh3 | 2009-08-20 04:49:15 -0400 (Thu, 20 Aug 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/ChangeLog - M /branches/prog/bwa/bwa.1 - -Updated ChangeLog and the manual - ------------------------------------------------------------------------- -r1198 | lh3 | 2009-08-19 11:09:15 -0400 (Wed, 19 Aug 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/main.c - - * bwa-0.4.9-36 (r1198) - * simplify duphits removal. The accuracy is changed a tiny bit, sometimes better, sometimes worse. - ------------------------------------------------------------------------- -r1197 | lh3 | 2009-08-19 08:15:05 -0400 (Wed, 19 Aug 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/Makefile - M /branches/prog/bwa/bwtsw2_aux.c - A /branches/prog/bwa/bwtsw2_chain.c - M /branches/prog/bwa/main.c - - * bwa-0.4.9-35 (r1197) - * further heuristic acceleration for long queries - ------------------------------------------------------------------------- -r1196 | lh3 | 2009-08-18 06:54:03 -0400 (Tue, 18 Aug 2009) | 4 lines -Changed paths: - M /branches/prog/bwa/bwa.1 - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/main.c - - * bwa-0.4.9-34 (r1196) - * updated the manual page - * output base quality if the input is fastq - ------------------------------------------------------------------------- -r1195 | lh3 | 2009-08-18 06:23:00 -0400 (Tue, 18 Aug 2009) | 4 lines -Changed paths: - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/bwtsw2_main.c - M /branches/prog/bwa/main.c - M /branches/prog/bwa/simple_dp.c - - * bwa-0.4.9-33 (r1191) - * fixed a bug in sampe/samse when gaps occur to the 5'-end in SW alignment - * in dbwtsw adjust -T and -c according to -a - ------------------------------------------------------------------------- -r1192 | lh3 | 2009-08-13 05:37:28 
-0400 (Thu, 13 Aug 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/bwa.1 - -update manual - ------------------------------------------------------------------------- -r1191 | lh3 | 2009-08-12 19:40:51 -0400 (Wed, 12 Aug 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/bwa.1 - M /branches/prog/bwa/bwtsw2_main.c - -update documentation - ------------------------------------------------------------------------- -r1190 | lh3 | 2009-08-12 08:56:10 -0400 (Wed, 12 Aug 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_main.c - M /branches/prog/bwa/main.c - - * bwa-0.4.9-32 (r1190) - * only help messages are changed - ------------------------------------------------------------------------- -r1189 | lh3 | 2009-08-11 09:28:55 -0400 (Tue, 11 Aug 2009) | 4 lines -Changed paths: - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/bwtsw2.h - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/main.c - - * bwa-0.4.9-31 (r1189) - * in bwape/bwase, print CIGAR "*" if the read is unmapped - * improved the calculation of mapping quality - ------------------------------------------------------------------------- -r1181 | lh3 | 2009-08-03 12:09:41 -0400 (Mon, 03 Aug 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_aux.c - -fflush() - ------------------------------------------------------------------------- -r1180 | lh3 | 2009-08-03 12:08:46 -0400 (Mon, 03 Aug 2009) | 4 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/main.c - - * bwa-0.4.9-30 (r1180) - * fixed a memory problem - * multi-threading sometimes does not work... 
- ------------------------------------------------------------------------- -r1179 | lh3 | 2009-08-03 11:04:39 -0400 (Mon, 03 Aug 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/Makefile - M /branches/prog/bwa/bwtsw2.h - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_main.c - M /branches/prog/bwa/main.c - - * bwa-0.4.9-29 (r1179) - * preliminary mutli-threading support in dbwtsw - ------------------------------------------------------------------------- -r1178 | lh3 | 2009-08-03 09:14:54 -0400 (Mon, 03 Aug 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_main.c - M /branches/prog/bwa/main.c - - * bwa-0.4.9-28 (r1178) - * fixed a bug in printing repetitive hits - ------------------------------------------------------------------------- -r1177 | lh3 | 2009-08-03 05:03:42 -0400 (Mon, 03 Aug 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/main.c - - * bwa-0.4.9-27 (r1177) - * bwtsw2: fixed a hidden memory leak - ------------------------------------------------------------------------- -r1176 | lh3 | 2009-07-31 10:58:24 -0400 (Fri, 31 Jul 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_main.c - M /branches/prog/bwa/main.c - - * bwa-0.4.9-26 - * change the way mapping quality is calculated - ------------------------------------------------------------------------- -r1175 | lh3 | 2009-07-31 09:15:54 -0400 (Fri, 31 Jul 2009) | 4 lines -Changed paths: - M /branches/prog/bwa/bwtsw2.h - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/bwtsw2_main.c - M /branches/prog/bwa/main.c - - * bwa-0.4.9-25 - * code clean up - * automatically adjust ->t and ->is_rev based on input - ------------------------------------------------------------------------- -r1174 | lh3 | 2009-07-30 08:50:25 -0400 (Thu, 30 Jul 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_aux.c - M 
/branches/prog/bwa/main.c - - * bwa-0.4.9-24 - * fixed a bug in printing the hits - ------------------------------------------------------------------------- -r1173 | lh3 | 2009-07-29 18:32:43 -0400 (Wed, 29 Jul 2009) | 4 lines -Changed paths: - M /branches/prog/bwa/bwtsw2.h - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_main.c - M /branches/prog/bwa/main.c - - * bwa-0.4.9-23 - * allow to skip reverse alignment - * increase opt->t to 37 - ------------------------------------------------------------------------- -r1172 | lh3 | 2009-07-29 17:22:39 -0400 (Wed, 29 Jul 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_main.c - M /branches/prog/bwa/main.c - - * bwa-0.4.9-22 - * report if the hit is found in both directions - ------------------------------------------------------------------------- -r1171 | lh3 | 2009-07-29 17:12:02 -0400 (Wed, 29 Jul 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtsw2.h - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_main.c - M /branches/prog/bwa/main.c - - * bwa-0.4.9-21 - * dbwtsw: map to both forward and reverse BWT to reduce false alignment - ------------------------------------------------------------------------- -r1170 | lh3 | 2009-07-29 15:25:14 -0400 (Wed, 29 Jul 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/main.c - -save hits before cut_tail() - ------------------------------------------------------------------------- -r1169 | lh3 | 2009-07-29 08:06:01 -0400 (Wed, 29 Jul 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtsw2.h - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/main.c - M /branches/prog/bwa/stdaln.c - M /branches/prog/bwa/stdaln.h - - * bwa-0.4.9-19 - * use a global memory pool to reduce the CPU time spent on malloc/free(). 
- ------------------------------------------------------------------------- -r1168 | lh3 | 2009-07-29 06:13:29 -0400 (Wed, 29 Jul 2009) | 5 lines -Changed paths: - M /branches/prog/bwa/bwtsw2.h - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/bwtsw2_main.c - M /branches/prog/bwa/main.c - - * bwa-0.4.9-18 - * reduce unnecessary extension to the 5'-end - * allow to use different interval size for the 2 rounds - * change default parameters - ------------------------------------------------------------------------- -r1167 | lh3 | 2009-07-28 19:06:17 -0400 (Tue, 28 Jul 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/main.c - - * bwa-0.4.9-17 - * dbwtsw: fixed THE memory leak. - ------------------------------------------------------------------------- -r1166 | lh3 | 2009-07-28 16:31:41 -0400 (Tue, 28 Jul 2009) | 5 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/bwtsw2_main.c - M /branches/prog/bwa/main.c - M /branches/prog/bwa/stdaln.c - - * bwa-0.4.9-16 - * fixed a memory leak - * a small memory leak still occurs to bwtsw2_core(). I will work on that later. - * changed the default parameters - ------------------------------------------------------------------------- -r1165 | lh3 | 2009-07-28 10:15:40 -0400 (Tue, 28 Jul 2009) | 4 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/main.c - M /branches/prog/bwa/stdaln.c - - * bwa-0.4.9-15 - * generate CIGAR right before output. This saves unnecessary computation. - * this version may be buggy as I have not tested it. 
- ------------------------------------------------------------------------- -r1164 | lh3 | 2009-07-28 09:04:14 -0400 (Tue, 28 Jul 2009) | 11 lines -Changed paths: - M /branches/prog/bwa/bwtsw2.h - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/main.c - M /branches/prog/bwa/stdaln.c - M /branches/prog/bwa/stdaln.h - - * bwa-0.4.9-14 - - * deplete unique hits in dbwtsw and postprocess them with standard sw - - * in principle, this stratgy should be faster and more accurate, but I - have not tested this point. I may switch back to the old method if - this does not work. - - * the code looks quite nasty now. it needs clean up... - - ------------------------------------------------------------------------- -r1163 | lh3 | 2009-07-27 17:41:10 -0400 (Mon, 27 Jul 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_aux.c - -change a default parameter - ------------------------------------------------------------------------- -r1162 | lh3 | 2009-07-27 17:04:35 -0400 (Mon, 27 Jul 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtsw2.h - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/bwtsw2_main.c - M /branches/prog/bwa/main.c - - * bwa-0.4.9-13 - * dbwtsw: switch between small and large Z-best - ------------------------------------------------------------------------- -r1161 | lh3 | 2009-07-27 12:17:41 -0400 (Mon, 27 Jul 2009) | 4 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/main.c - - * bwa-0.4.9-12 - * changed the default -z to 100 - * heuristically speed up alignments for polyA reads - ------------------------------------------------------------------------- -r1160 | lh3 | 2009-07-27 07:50:57 -0400 (Mon, 27 Jul 2009) | 6 lines -Changed paths: - M /branches/prog/bwa/bwtsw2.h - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/main.c - - * 
bwa-0.4.9-11 - - * dbwtsw potentially generates less false alignments, although in - practice, the modification brings no improvement. - - ------------------------------------------------------------------------- -r1159 | lh3 | 2009-07-27 04:37:02 -0400 (Mon, 27 Jul 2009) | 4 lines -Changed paths: - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/main.c - - * bwa-0.4.9-10 - * disabled debugging code - * add "BAM_FMU" if both ends are unmapped - ------------------------------------------------------------------------- -r1158 | lh3 | 2009-07-24 09:36:52 -0400 (Fri, 24 Jul 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/main.c - -nothing, really - ------------------------------------------------------------------------- -r1157 | lh3 | 2009-07-24 09:05:44 -0400 (Fri, 24 Jul 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/main.c - - * bwa-0.4.9-9 - * bwtsw2: generate SAM output - ------------------------------------------------------------------------- -r1156 | lh3 | 2009-07-24 05:42:47 -0400 (Fri, 24 Jul 2009) | 6 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/main.c - - * bwa-0.4.9-8 - - * fixed a weird deadloop which only happens to icc -O3. Thanks John - Marshall for the fix. - - ------------------------------------------------------------------------- -r1155 | lh3 | 2009-07-24 05:28:40 -0400 (Fri, 24 Jul 2009) | 8 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/main.c - - * bwa-0.4.9-7 - - * fixed a typo in bwtsw2 alignment. Now score from the standard SW - seems to agree with score from bwtsw2, except that in reporting - alignments, bwtsw2 may report non-optimal segments. This is expected, - though. I will improve in future. 
- - ------------------------------------------------------------------------- -r1154 | lh3 | 2009-07-23 17:40:20 -0400 (Thu, 23 Jul 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/stdaln.c - M /branches/prog/bwa/stdaln.h - - * aln_left_core() seems to work properly - * aln_local_core() has a bug... AN EVER EXISTING BUG!!!!!!!!!!! - ------------------------------------------------------------------------- -r1153 | lh3 | 2009-07-23 17:06:09 -0400 (Thu, 23 Jul 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/stdaln.c - -removed debugging code... - ------------------------------------------------------------------------- -r1152 | lh3 | 2009-07-23 17:01:00 -0400 (Thu, 23 Jul 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/stdaln.c - - * radical changes failed... - * fixed a bug - ------------------------------------------------------------------------- -r1151 | lh3 | 2009-07-23 14:46:35 -0400 (Thu, 23 Jul 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/stdaln.c - -temporary changes. Will apply some radical changes to this file... - ------------------------------------------------------------------------- -r1150 | lh3 | 2009-07-23 10:09:56 -0400 (Thu, 23 Jul 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_main.c - M /branches/prog/bwa/stdaln.c - -fixed a long-existing bug in Smith-Waterman alignment - ------------------------------------------------------------------------- -r1149 | lh3 | 2009-07-23 08:50:52 -0400 (Thu, 23 Jul 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtsw2.h - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_main.c - M /branches/prog/bwa/main.c - M /branches/prog/bwa/simple_dp.c - M /branches/prog/bwa/stdaln.c - M /branches/prog/bwa/stdaln.h - - * bwa-0.4.9-6 - * unexplained inconsistency still occurs, but the results largely look reasonable. 
- ------------------------------------------------------------------------- -r1148 | lh3 | 2009-07-23 08:07:29 -0400 (Thu, 23 Jul 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/stdaln.c - -half DP - ------------------------------------------------------------------------- -r1147 | lh3 | 2009-07-22 08:03:06 -0400 (Wed, 22 Jul 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/bwtsw2.h - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/bwtsw2_main.c - -a bit code clean up - ------------------------------------------------------------------------- -r1145 | lh3 | 2009-07-21 15:52:05 -0400 (Tue, 21 Jul 2009) | 4 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/main.c - - * bwa-0.4.9-5 - * fixed a bug in determining sub-optimal hits - * removed some debugging codes - ------------------------------------------------------------------------- -r1144 | lh3 | 2009-07-21 10:17:29 -0400 (Tue, 21 Jul 2009) | 4 lines -Changed paths: - M /branches/prog/bwa/bwtsw2.h - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/bwtsw2_main.c - M /branches/prog/bwa/main.c - - * bwa-0.4.9-4 - * better cmd interface - * faster speed - ------------------------------------------------------------------------- -r1143 | lh3 | 2009-07-20 16:38:18 -0400 (Mon, 20 Jul 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtsw2.h - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/bwtsw2_main.c - M /branches/prog/bwa/main.c - -bwtsw2 (dBWT-SW) is working apparently... 
- - ------------------------------------------------------------------------- -r1139 | lh3 | 2009-07-15 05:52:18 -0400 (Wed, 15 Jul 2009) | 4 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/main.c - - * bwa-0.4.9-2 - * bwtsw2: change cut_tail() such that it is faster but more likely to - miss true hits - ------------------------------------------------------------------------- -r1138 | lh3 | 2009-07-15 05:18:42 -0400 (Wed, 15 Jul 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/Makefile - A /branches/prog/bwa/bwt_lite.c - A /branches/prog/bwa/bwt_lite.h - A /branches/prog/bwa/bwtsw2.h - A /branches/prog/bwa/bwtsw2_aux.c - A /branches/prog/bwa/bwtsw2_core.c - A /branches/prog/bwa/bwtsw2_main.c - M /branches/prog/bwa/main.c - M /branches/prog/bwa/main.h - - * bwa-0.4.9-1 - * added back bwtsw2 - ------------------------------------------------------------------------- -r1075 | lh3 | 2009-05-19 05:14:50 -0400 (Tue, 19 May 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/NEWS - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/main.c - -Release bwa-0.4.9 - ------------------------------------------------------------------------- -r1073 | lh3 | 2009-05-18 17:13:19 -0400 (Mon, 18 May 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/NEWS - M /branches/prog/bwa/bwa.1 - M /branches/prog/bwa/main.c - -Release bwa-0.4.8 - ------------------------------------------------------------------------- -r1069 | lh3 | 2009-05-14 09:54:54 -0400 (Thu, 14 May 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/main.c - - * bwa-0.4.7-2 - * change the default of "aln -R" to 30 - ------------------------------------------------------------------------- -r1068 | lh3 | 2009-05-14 09:27:55 -0400 (Thu, 14 May 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/bwtgap.c - M 
/branches/prog/bwa/main.c - - * bwa-0.4.7-1 - * search for suboptimal hits if the top hit is not so repetitive - ------------------------------------------------------------------------- -r1066 | lh3 | 2009-05-12 15:31:31 -0400 (Tue, 12 May 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/ChangeLog - M /branches/prog/bwa/NEWS - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/main.c - -Release bwa-0.4.7 - ------------------------------------------------------------------------- -r1065 | lh3 | 2009-05-12 15:20:40 -0400 (Tue, 12 May 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/main.c - - * bwa-0.4.6-9 - * fixed compiling errors on some Linux machines - ------------------------------------------------------------------------- -r1064 | lh3 | 2009-05-12 07:30:46 -0400 (Tue, 12 May 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/main.c - - * bwa-0.4.6-8 - * avoid compilation error on some systems. 
- ------------------------------------------------------------------------- -r1035 | lh3 | 2009-05-09 05:41:33 -0400 (Sat, 09 May 2009) | 4 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/main.c - - * bwa-0.4.6-7 - * fixed an integer overflow caused by previous modifications - * made insert size estimation more robust - ------------------------------------------------------------------------- -r1008 | lh3 | 2009-04-29 05:41:58 -0400 (Wed, 29 Apr 2009) | 4 lines -Changed paths: - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/main.c - - * bwa-0.4.6-5 - * fixed a integer overflow problem which may cause seg fault in very rare cases - * made XN tags more accurate - ------------------------------------------------------------------------- -r1005 | lh3 | 2009-04-27 07:37:23 -0400 (Mon, 27 Apr 2009) | 4 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/main.c - M /branches/prog/bwa/simple_dp.c - M /branches/prog/bwa/stdaln.c - M /branches/prog/bwa/stdaln.h - - * bwa-0.4.6-4 - * heuristic rules to detect suboptimal alignment - * stdsw: support double-strand and protein alignment - ------------------------------------------------------------------------- -r1003 | lh3 | 2009-04-26 12:48:19 -0400 (Sun, 26 Apr 2009) | 4 lines -Changed paths: - M /branches/prog/bwa/main.c - M /branches/prog/bwa/simple_dp.c - M /branches/prog/bwa/stdaln.c - M /branches/prog/bwa/stdaln.h - - * bwa-0.4.6-2 - * improve the functionality of stdsw - * allow to add a threshold on SW alignment. Hope this does not incur new bugs... 
- ------------------------------------------------------------------------- -r1002 | lh3 | 2009-04-22 03:56:15 -0400 (Wed, 22 Apr 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwa.1 - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/main.c - - * bwa-0.4.6-1 - * output SM and AM tag - ------------------------------------------------------------------------- -r914 | lh3 | 2009-03-09 17:53:50 -0400 (Mon, 09 Mar 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/ChangeLog - M /branches/prog/bwa/NEWS - M /branches/prog/bwa/main.c - -Release bwa-0.4.6 - ------------------------------------------------------------------------- -r913 | lh3 | 2009-03-09 17:23:24 -0400 (Mon, 09 Mar 2009) | 4 lines -Changed paths: - M /branches/prog/bwa/bwa.1 - M /branches/prog/bwa/bwape.c - A /branches/prog/bwa/solid2fastq.pl - - * added notes to bwa - * added a script to convert SOLiD reads - * updated documentations - ------------------------------------------------------------------------- -r912 | lh3 | 2009-03-09 16:57:05 -0400 (Mon, 09 Mar 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/ChangeLog - M /branches/prog/bwa/kstring.c - M /branches/prog/bwa/main.c - -fixed a bug in kstring - ------------------------------------------------------------------------- -r881 | lh3 | 2009-03-02 15:36:06 -0500 (Mon, 02 Mar 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtmisc.c - M /branches/prog/bwa/main.c - - * bwa-0.4.5-7 - * fixed a bug in pac2cspac - ------------------------------------------------------------------------- -r880 | lh3 | 2009-03-01 16:34:08 -0500 (Sun, 01 Mar 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/Makefile - -disable debugging - ------------------------------------------------------------------------- -r879 | lh3 | 2009-03-01 16:28:04 -0500 (Sun, 01 Mar 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/cs2nt.c - M 
/branches/prog/bwa/main.c - - * bwa-0.4.5-6 - * fixed problems with coordinates for color gapped alignment - ------------------------------------------------------------------------- -r878 | lh3 | 2009-03-01 13:43:09 -0500 (Sun, 01 Mar 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/cs2nt.c - M /branches/prog/bwa/main.c - - * bwa-0.4.5-5 - * added support for gapped color alignment - ------------------------------------------------------------------------- -r877 | lh3 | 2009-03-01 10:27:52 -0500 (Sun, 01 Mar 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/Makefile - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/cs2nt.c - M /branches/prog/bwa/main.c - - * convert cs read to nt read (for ungapped alignment only) - ------------------------------------------------------------------------- -r860 | lh3 | 2009-02-27 08:58:39 -0500 (Fri, 27 Feb 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/Makefile - M /branches/prog/bwa/bwase.c - A /branches/prog/bwa/cs2nt.c - -prepare to implement cs->nt conversion (have not yet...) - ------------------------------------------------------------------------- -r859 | lh3 | 2009-02-27 07:00:03 -0500 (Fri, 27 Feb 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bntseq.c - M /branches/prog/bwa/bntseq.h - M /branches/prog/bwa/bwtindex.c - M /branches/prog/bwa/bwtmisc.c - M /branches/prog/bwa/main.c - M /branches/prog/bwa/main.h - - * bwa-0.4.5-3 - * generate color index from nucleotide fasta reference - ------------------------------------------------------------------------- -r857 | lh3 | 2009-02-26 10:22:58 -0500 (Thu, 26 Feb 2009) | 4 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/main.c - - * bwa-0.4.5-2 - * improved mapping quality a bit if one end falls in a tandem repeat - but the mate is unique. 
- ------------------------------------------------------------------------- -r856 | lh3 | 2009-02-26 10:02:29 -0500 (Thu, 26 Feb 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/main.c - - * bwa-0.4.5-1 - * make bwa work for SOLiD reads - ------------------------------------------------------------------------- -r828 | lh3 | 2009-02-18 17:36:41 -0500 (Wed, 18 Feb 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/ChangeLog - M /branches/prog/bwa/NEWS - M /branches/prog/bwa/bwa.1 - M /branches/prog/bwa/main.c - -Release bwa-0.4.5 - ------------------------------------------------------------------------- -r827 | lh3 | 2009-02-18 16:48:48 -0500 (Wed, 18 Feb 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/main.c - M /branches/prog/bwa/stdaln.c - M /branches/prog/bwa/stdaln.h - - * bwa-0.4.4-6 - * fixed a bug in SW alignment when no residue matches - ------------------------------------------------------------------------- -r824 | lh3 | 2009-02-17 05:33:07 -0500 (Tue, 17 Feb 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/main.c - - * bwa-0.4.4-5 - * fixed that bounary bug - ------------------------------------------------------------------------- -r823 | lh3 | 2009-02-17 04:54:18 -0500 (Tue, 17 Feb 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/ChangeLog - M /branches/prog/bwa/bwape.c - -just change some logging information - ------------------------------------------------------------------------- -r822 | lh3 | 2009-02-17 04:20:39 -0500 (Tue, 17 Feb 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/bwa.1 - -update manual - ------------------------------------------------------------------------- -r821 | lh3 | 2009-02-17 04:11:14 -0500 (Tue, 17 Feb 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/main.c - - * bwa-0.4.4-4 - * fixed a bug on boundary check in pair_sw - 
------------------------------------------------------------------------- -r820 | lh3 | 2009-02-16 17:43:37 -0500 (Mon, 16 Feb 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/main.c - - * bwa-0.4.4-3 - * allow to change mismatch penalty - ------------------------------------------------------------------------- -r819 | lh3 | 2009-02-16 17:40:28 -0500 (Mon, 16 Feb 2009) | 4 lines -Changed paths: - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/main.c - - * bwa-0.4.4-2 - * remove timer - * allow to change default gapo and gape penalty at the command line - ------------------------------------------------------------------------- -r818 | lh3 | 2009-02-16 09:30:51 -0500 (Mon, 16 Feb 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/bwa.1 - -update benchmark - ------------------------------------------------------------------------- -r817 | lh3 | 2009-02-16 08:44:40 -0500 (Mon, 16 Feb 2009) | 4 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/kvec.h - M /branches/prog/bwa/main.c - - * bwa-0.4.4-1 - * automatically detect insert size - * use insert size in pairing. This may potentially improve accuracy (untested!) 
- ------------------------------------------------------------------------- -r814 | lh3 | 2009-02-15 11:10:23 -0500 (Sun, 15 Feb 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/ChangeLog - M /branches/prog/bwa/NEWS - M /branches/prog/bwa/main.c - -Release bwa-0.4.4 - ------------------------------------------------------------------------- -r813 | lh3 | 2009-02-15 10:22:50 -0500 (Sun, 15 Feb 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwa.1 - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/main.c - - * bwa-0.4.3-5 - * impose boundary check in refine_gapped - ------------------------------------------------------------------------- -r811 | lh3 | 2009-02-14 09:46:13 -0500 (Sat, 14 Feb 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/main.c - - * bwa-0.4.3-4 - * change MD tag to match the latest SAM specification - ------------------------------------------------------------------------- -r810 | lh3 | 2009-02-13 04:46:04 -0500 (Fri, 13 Feb 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/ChangeLog - -update ChangeLog - ------------------------------------------------------------------------- -r799 | lh3 | 2009-02-05 12:01:17 -0500 (Thu, 05 Feb 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/main.c - -change MD tag to meet the latest SAM specification - ------------------------------------------------------------------------- -r796 | lh3 | 2009-02-05 08:35:13 -0500 (Thu, 05 Feb 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bntseq.c - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/main.c - - * bwa-0.4.3-2 - * fixed a bug on counting 'N' - ------------------------------------------------------------------------- -r795 | lh3 | 2009-02-05 07:41:27 -0500 (Thu, 05 Feb 2009) | 4 lines -Changed paths: - M /branches/prog/bwa/bwa.1 - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/main.c - - * bwa-0.4.3-1 - * fixed potential boundary problems - * update 
benchmark result - ------------------------------------------------------------------------- -r791 | lh3 | 2009-01-25 05:20:47 -0500 (Sun, 25 Jan 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/bwa.1 - -update some numbers - ------------------------------------------------------------------------- -r790 | lh3 | 2009-01-24 15:13:03 -0500 (Sat, 24 Jan 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/bwa.1 - -update benchmark - ------------------------------------------------------------------------- -r789 | lh3 | 2009-01-22 10:18:44 -0500 (Thu, 22 Jan 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/bwtindex.c - -a warning message for index - ------------------------------------------------------------------------- -r788 | lh3 | 2009-01-22 09:54:06 -0500 (Thu, 22 Jan 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/main.c - -forget to change release number - ------------------------------------------------------------------------- -r786 | lh3 | 2009-01-22 06:27:39 -0500 (Thu, 22 Jan 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/NEWS - -Release bwa-0.4.3 - ------------------------------------------------------------------------- -r785 | lh3 | 2009-01-22 06:27:16 -0500 (Thu, 22 Jan 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/ChangeLog - M /branches/prog/bwa/NEWS - -Release bwa-0.4.3 - ------------------------------------------------------------------------- -r784 | lh3 | 2009-01-22 06:19:59 -0500 (Thu, 22 Jan 2009) | 4 lines -Changed paths: - M /branches/prog/bwa/bwa.1 - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/main.c - - * bwa-0.4.2-10 - * update documentation - * fixed a bug on generating MD tags for SW alignment - ------------------------------------------------------------------------- -r782 | lh3 | 2009-01-19 12:08:38 -0500 (Mon, 19 Jan 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/main.c - - * bwa-0.4.2-9 - * fixed a bug in samse -n... 
- ------------------------------------------------------------------------- -r781 | lh3 | 2009-01-19 11:26:37 -0500 (Mon, 19 Jan 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/main.c - - * bwa-0.4.2-8 - * given -N, the previous version would stop if the top hit is a repeat. Now changed. - ------------------------------------------------------------------------- -r780 | lh3 | 2009-01-19 11:20:18 -0500 (Mon, 19 Jan 2009) | 4 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/bwtgap.c - M /branches/prog/bwa/main.c - - * bwa-0.4.2-7 - * use a bit-wise flag to replace some member variables in the option struct - * allow to switch off the iterative strategy - ------------------------------------------------------------------------- -r779 | lh3 | 2009-01-19 10:45:57 -0500 (Mon, 19 Jan 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwa.1 - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/main.c - - * bwa-0.4.2-6 - * allow to dump multiple hits from samse, in another format, though - ------------------------------------------------------------------------- -r778 | lh3 | 2009-01-19 06:24:29 -0500 (Mon, 19 Jan 2009) | 5 lines -Changed paths: - M /branches/prog/bwa/Makefile - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/bwaseqio.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/kseq.h - A /branches/prog/bwa/kstring.c - A /branches/prog/bwa/kstring.h - M /branches/prog/bwa/main.c - M /branches/prog/bwa/simple_dp.c - - * bwa-0.4.2-5 - * update kseq.h to the latest version - * generate MD tag - * print mate coordinate if only one end is unmapped - ------------------------------------------------------------------------- -r775 | lh3 | 2009-01-18 05:40:35 -0500 (Sun, 18 Jan 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/main.c - - 
* bwa-0.4.2-4 - * fixed a bug for SAM format - ------------------------------------------------------------------------- -r774 | lh3 | 2009-01-17 13:48:52 -0500 (Sat, 17 Jan 2009) | 4 lines -Changed paths: - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/main.c - - * bwa-0.4.2-3 - * change default fnr to 0.04 - * print max_diff for valid fnr - ------------------------------------------------------------------------- -r773 | lh3 | 2009-01-17 05:54:37 -0500 (Sat, 17 Jan 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/main.c - - * bwa-0.4.2-2 - * automatically choose max_diff - ------------------------------------------------------------------------- -r772 | lh3 | 2009-01-16 18:16:14 -0500 (Fri, 16 Jan 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwaseqio.c - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/bwtgap.c - M /branches/prog/bwa/main.c - - * bwa-0.4.2-1 - * take N as a mismatch - ------------------------------------------------------------------------- -r768 | lh3 | 2009-01-09 11:57:23 -0500 (Fri, 09 Jan 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/ChangeLog - M /branches/prog/bwa/NEWS - M /branches/prog/bwa/bntseq.c - M /branches/prog/bwa/main.c - -Release bwa-0.4.2 - ------------------------------------------------------------------------- -r759 | lh3 | 2009-01-07 09:55:43 -0500 (Wed, 07 Jan 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/ChangeLog - M /branches/prog/bwa/NEWS - M /branches/prog/bwa/bwa.1 - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/main.c - -Release bwa-0.4.1 - ------------------------------------------------------------------------- -r758 | lh3 | 2009-01-07 05:36:06 -0500 (Wed, 07 Jan 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M 
/branches/prog/bwa/bwase.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/main.c - - * bwa-0.4.0-2 - * make mate_sw fully working - ------------------------------------------------------------------------- -r757 | lh3 | 2009-01-06 18:04:29 -0500 (Tue, 06 Jan 2009) | 5 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/bwaseqio.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/main.c - - * bwa-0.4.0-1 - * do SW alignment for unmapped mate. It is working. - * I still need to do some extra work for SW alignment, but it is too late - and I am getting tired... I will do tomorrow. - ------------------------------------------------------------------------- -r755 | lh3 | 2009-01-06 10:23:29 -0500 (Tue, 06 Jan 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/ChangeLog - M /branches/prog/bwa/NEWS - M /branches/prog/bwa/bwa.1 - M /branches/prog/bwa/main.c - -Release bwa-0.4.0 - ------------------------------------------------------------------------- -r754 | lh3 | 2009-01-06 07:45:02 -0500 (Tue, 06 Jan 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtgap.c - M /branches/prog/bwa/bwtgap.h - M /branches/prog/bwa/main.c - - * bwa-0.3.0-12 - * better lock - ------------------------------------------------------------------------- -r753 | lh3 | 2009-01-06 06:17:21 -0500 (Tue, 06 Jan 2009) | 5 lines -Changed paths: - M /branches/prog/bwa/Makefile - M /branches/prog/bwa/bwaseqio.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/bwtgap.c - M /branches/prog/bwa/main.c - - * bwa-0.3.0-11 - * fixed a small memory leak in bwa_seq_close() - * fixed "uninitialized memory" from bwt_aln1_t - * multithreading for "aln" command - ------------------------------------------------------------------------- -r752 | lh3 | 2009-01-05 17:34:13 -0500 (Mon, 05 Jan 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/Makefile - D 
/branches/prog/bwa/bwt2fmv.c - M /branches/prog/bwa/bwt_gen/bwt_gen.c - A /branches/prog/bwa/bwtmisc.c (from /branches/prog/bwa/pac2bwt.c:748) - M /branches/prog/bwa/main.c - M /branches/prog/bwa/main.h - D /branches/prog/bwa/pac2bwt.c - - * bwa-0.3.0-10 - * a little bit code clean up - ------------------------------------------------------------------------- -r751 | lh3 | 2009-01-05 17:19:04 -0500 (Mon, 05 Jan 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/main.c - - * bwa-0.3.0-9 - * use 64-bit integer to speed up Occ calculate, although just a little bit - ------------------------------------------------------------------------- -r750 | lh3 | 2009-01-05 16:44:26 -0500 (Mon, 05 Jan 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/main.c - - * bwa-0.3.0-8 - * a little bit code cleanup - ------------------------------------------------------------------------- -r749 | lh3 | 2009-01-05 16:37:28 -0500 (Mon, 05 Jan 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/main.c - - * bwa-0.1.0-7 - * accelerate Occ calculation - ------------------------------------------------------------------------- -r748 | lh3 | 2009-01-05 16:12:28 -0500 (Mon, 05 Jan 2009) | 4 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/bwt.h - M /branches/prog/bwa/bwt2fmv.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtindex.c - M /branches/prog/bwa/bwtio.c - M /branches/prog/bwa/main.c - M /branches/prog/bwa/main.h - M /branches/prog/bwa/pac2bwt.c - - * bwa-0.3.0-6 - * put occ table along with bwt to save another cache miss - * this version is already faster than the previous and I can still improve it... 
- ------------------------------------------------------------------------- -r747 | lh3 | 2009-01-05 10:16:18 -0500 (Mon, 05 Jan 2009) | 5 lines -Changed paths: - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/bwt.h - M /branches/prog/bwa/bwtio.c - M /branches/prog/bwa/main.c - - * bwa-0.3.0-5 - * remove occ_major to save a cache miss; however, OCC_INTERVAL has to be - increased to keep the same memory. As a result, the speed is a little - slower in fact. - ------------------------------------------------------------------------- -r746 | lh3 | 2009-01-05 09:50:53 -0500 (Mon, 05 Jan 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/main.c - - * bwa-0.3.0-4 - * added back optimization codes (it is a pain...) - ------------------------------------------------------------------------- -r745 | lh3 | 2009-01-05 08:23:00 -0500 (Mon, 05 Jan 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/main.c - - * bwa-0.3.0-3 - * faster bit operations - ------------------------------------------------------------------------- -r744 | lh3 | 2009-01-05 05:58:46 -0500 (Mon, 05 Jan 2009) | 4 lines -Changed paths: - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/main.c - - * bwa-0.3.0-2 - * removed optimization codes again... 
- * use a new method to count the bits - ------------------------------------------------------------------------- -r743 | lh3 | 2009-01-04 17:18:38 -0500 (Sun, 04 Jan 2009) | 5 lines -Changed paths: - M /branches/prog/bwa/bwa.1 - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/bwtgap.c - M /branches/prog/bwa/main.c - - * bwa-0.3.0-1 - * added back the optimization codes - * added a new option to aln: max_entries, although this is disabled by default - * updated benchmark - ------------------------------------------------------------------------- -r742 | lh3 | 2009-01-04 07:56:12 -0500 (Sun, 04 Jan 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/bwa.1 - -add URL - ------------------------------------------------------------------------- -r740 | lh3 | 2009-01-04 07:39:43 -0500 (Sun, 04 Jan 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/ChangeLog - M /branches/prog/bwa/NEWS - M /branches/prog/bwa/bwa.1 - M /branches/prog/bwa/main.c - -Release bwa-0.3.0 - ------------------------------------------------------------------------- -r739 | lh3 | 2009-01-04 06:55:06 -0500 (Sun, 04 Jan 2009) | 2 lines -Changed paths: - A /branches/prog/bwa/COPYING - M /branches/prog/bwa/ChangeLog - M /branches/prog/bwa/bntseq.c - M /branches/prog/bwa/bntseq.h - M /branches/prog/bwa/bwa.1 - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/bwt.h - M /branches/prog/bwa/bwtindex.c - M /branches/prog/bwa/utils.c - M /branches/prog/bwa/utils.h - -added licensing information - ------------------------------------------------------------------------- -r738 | lh3 | 2009-01-04 06:18:25 -0500 (Sun, 04 Jan 2009) | 4 lines -Changed paths: - M /branches/prog/bwa/bwa.1 - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/main.c - - * bwa-0.2.0-31 - * better mapping quality - * update benchmark - ------------------------------------------------------------------------- -r737 | lh3 | 2009-01-03 16:00:58 -0500 (Sat, 03 Jan 
2009) | 2 lines -Changed paths: - M /branches/prog/bwa/ChangeLog - M /branches/prog/bwa/bwa.1 - -update documentation - ------------------------------------------------------------------------- -r736 | lh3 | 2009-01-02 10:26:38 -0500 (Fri, 02 Jan 2009) | 2 lines -Changed paths: - M /branches/prog/bwa/bwa.1 - -update documentation - ------------------------------------------------------------------------- -r735 | lh3 | 2009-01-02 07:10:20 -0500 (Fri, 02 Jan 2009) | 4 lines -Changed paths: - M /branches/prog/bwa/bwa.1 - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/main.c - - * bwa-0.2.0-30 - * reduce memory a little bit - * update documentation - ------------------------------------------------------------------------- -r734 | lh3 | 2009-01-01 13:45:45 -0500 (Thu, 01 Jan 2009) | 8 lines -Changed paths: - M /branches/prog/bwa/bwa.1 - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/bwtgap.c - M /branches/prog/bwa/main.c - - * bwa-0.2.0-29 - * sampe: removed -O option; changed default -o to 100000 - * sampe: fixed a bug in calculating paired mapping quality - * aln: added an option to search for suboptimal hits even if the best is a repeat. - This option will make sampe MUCH SLOWER. 
- * sampe: set isize as zero if mapped to two different chr - * update manual (unfinished) - ------------------------------------------------------------------------- -r733 | lh3 | 2009-01-01 11:01:20 -0500 (Thu, 01 Jan 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/main.c - - * bwa-0.2.0-28 - * fixed a bug in calculating paired mapping quality - ------------------------------------------------------------------------- -r732 | lh3 | 2009-01-01 09:27:46 -0500 (Thu, 01 Jan 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwape.c - A /branches/prog/bwa/khash.h (from /branches/prog/sclib/khash/khash.h:675) - M /branches/prog/bwa/main.c - - * bwa-0.2.0-27 - * accelerate sampe by storing visited large intervals - ------------------------------------------------------------------------- -r731 | lh3 | 2009-01-01 06:51:21 -0500 (Thu, 01 Jan 2009) | 3 lines -Changed paths: - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/main.c - - * bwa-0.2.0-26 - * remove the optimation codes - ------------------------------------------------------------------------- -r730 | lh3 | 2009-01-01 06:48:59 -0500 (Thu, 01 Jan 2009) | 4 lines -Changed paths: - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/main.c - - * bwa-0.2.0-25 - * accelerate OCC calculation by ~7%. However, it seems not worth doing - this by complicate the codes. I will change back later. 
- ------------------------------------------------------------------------- -r729 | lh3 | 2008-12-31 16:43:56 -0500 (Wed, 31 Dec 2008) | 6 lines -Changed paths: - M /branches/prog/bwa/bntseq.c - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/main.c - - * bwa-0.2.0-24 - * change command "sai2sam_pe" to "sampe" - * print usage for sampe command - * in sampe: change default max_occ to 1000 - * fixed a few compiling warnings in bntseq.c - ------------------------------------------------------------------------- -r728 | lh3 | 2008-12-27 07:14:59 -0500 (Sat, 27 Dec 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/main.c - - * bwa-0.2.0-22 - * mating information can be printed to SAM - ------------------------------------------------------------------------- -r727 | lh3 | 2008-12-26 18:10:59 -0500 (Fri, 26 Dec 2008) | 4 lines -Changed paths: - M /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/bwaseqio.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/main.c - - * bwa-0.2.0-21 - * implement pairing (still UNFINISHED) - * output all reads even if full of N - ------------------------------------------------------------------------- -r726 | lh3 | 2008-12-26 13:31:27 -0500 (Fri, 26 Dec 2008) | 5 lines -Changed paths: - M /branches/prog/bwa/Makefile - A /branches/prog/bwa/bwape.c - M /branches/prog/bwa/bwase.c - M /branches/prog/bwa/bwt2fmv.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/bwtgap.c - M /branches/prog/bwa/main.c - M /branches/prog/bwa/main.h - - * bwa-0.2.0-20 - * remove "-t" from aln cmd - * code clean up: move some functions in bwt2fmv.c to other source files - * added sai2sam_pe cmd: *UNFINISHED* - ------------------------------------------------------------------------- -r725 | lh3 | 2008-12-26 07:04:11 -0500 (Fri, 26 Dec 2008) | 3 lines -Changed 
paths: - M /branches/prog/bwa/Makefile - A /branches/prog/bwa/bwase.c - A /branches/prog/bwa/bwaseqio.c - M /branches/prog/bwa/bwt2fmv.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/bwtgap.c - M /branches/prog/bwa/kseq.h - A /branches/prog/bwa/ksort.h (from /branches/prog/sclib/ksort/ksort.h:712) - A /branches/prog/bwa/kvec.h (from /branches/prog/sclib/kvec/kvec.h:537) - M /branches/prog/bwa/main.c - - * bwa-0.2.0-19 - * considerable code cleanup; no actual changes - ------------------------------------------------------------------------- -r724 | lh3 | 2008-12-25 11:32:11 -0500 (Thu, 25 Dec 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/main.c - - * bwa-0.2.0-18 - * generate SAM output - ------------------------------------------------------------------------- -r723 | lh3 | 2008-12-25 10:48:31 -0500 (Thu, 25 Dec 2008) | 4 lines -Changed paths: - M /branches/prog/bwa/bwt2fmv.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/main.c - M /branches/prog/bwa/main.h - - * bwa-0.2.0-17 - * remove bwtsw2 related codes - * separate searching for SA interval from generating alignments - ------------------------------------------------------------------------- -r722 | lh3 | 2008-12-25 08:57:13 -0500 (Thu, 25 Dec 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/Makefile - M /branches/prog/bwa/bwt2fmv.c - D /branches/prog/bwa/bwt_lite.c - D /branches/prog/bwa/bwt_lite.h - M /branches/prog/bwa/bwtgap.c - D /branches/prog/bwa/bwtsw2.h - D /branches/prog/bwa/bwtsw2_aux.c - D /branches/prog/bwa/bwtsw2_core.c - D /branches/prog/bwa/bwtsw2_main.c - D /branches/prog/bwa/khash.h - D /branches/prog/bwa/ksort.h - D /branches/prog/bwa/kvec.h - M /branches/prog/bwa/main.c - - * added interface to "aln -t" - * remove bwtsw2 related codes - ------------------------------------------------------------------------- -r666 | lh3 | 2008-11-18 18:34:29 -0500 (Tue, 18 Nov 
2008) | 4 lines -Changed paths: - M /branches/prog/bwa/bwt2fmv.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/main.c - - * bwa-0.2.0-16 - * allow to set max mismatches based on read length, but I do not know - whether this really works - ------------------------------------------------------------------------- -r665 | lh3 | 2008-11-18 08:34:03 -0500 (Tue, 18 Nov 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/main.c - - * bwa-0.2.0-15 - * fixed a bug in sequence parser. - ------------------------------------------------------------------------- -r612 | lh3 | 2008-10-28 06:50:53 -0400 (Tue, 28 Oct 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/bntseq.c - M /branches/prog/bwa/bwtindex.c - M /branches/prog/bwa/main.c - M /branches/prog/bwa/utils.c - - * bwa-0.2.0-14 - * fixed a bug caused by the change of the FASTA/Q parser - ------------------------------------------------------------------------- -r611 | lh3 | 2008-10-28 06:24:56 -0400 (Tue, 28 Oct 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/Makefile - M /branches/prog/bwa/bntseq.c - M /branches/prog/bwa/bntseq.h - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtsw2_core.c - A /branches/prog/bwa/kseq.h - D /branches/prog/bwa/seq.c - D /branches/prog/bwa/seq.h - M /branches/prog/bwa/simple_dp.c - M /branches/prog/bwa/utils.c - M /branches/prog/bwa/utils.h - -replace seq.* with kseq.h - ------------------------------------------------------------------------- -r610 | lh3 | 2008-10-27 13:00:04 -0400 (Mon, 27 Oct 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtsw2.h - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/main.c - - * bwa-0.2.0-13 - * make bwtsw2 output sub-optimal hits. 
not completed - ------------------------------------------------------------------------- -r609 | lh3 | 2008-10-24 16:52:00 -0400 (Fri, 24 Oct 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/kvec.h - -little... - ------------------------------------------------------------------------- -r532 | lh3 | 2008-09-19 05:28:45 -0400 (Fri, 19 Sep 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/khash.h - -improve interface of khash - ------------------------------------------------------------------------- -r531 | lh3 | 2008-09-18 06:52:59 -0400 (Thu, 18 Sep 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_core.c - -improve minor things, which make bwtsw2 slower, but should miss less true hits - ------------------------------------------------------------------------- -r530 | lh3 | 2008-09-17 18:19:26 -0400 (Wed, 17 Sep 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_core.c - - * fixed a bug in calculating ->D - * enforce band-width checking - ------------------------------------------------------------------------- -r529 | lh3 | 2008-09-17 18:06:49 -0400 (Wed, 17 Sep 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_core.c - -delete a line of code that is never visited - ------------------------------------------------------------------------- -r528 | lh3 | 2008-09-17 17:58:51 -0400 (Wed, 17 Sep 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_core.c - -a bit code clean up - ------------------------------------------------------------------------- -r527 | lh3 | 2008-09-17 10:55:45 -0400 (Wed, 17 Sep 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/bwtsw2_main.c - M /branches/prog/bwa/main.c - - * bwa-0.2.0-12 - * max-depth can be set, although it does not help the speed at all - ------------------------------------------------------------------------- -r526 | lh3 | 2008-09-16 17:59:36 -0400 (Tue, 16 Sep 2008) | 2 lines -Changed 
paths: - M /branches/prog/bwa/bwtsw2_core.c - -cut_tail after remove duplicate - ------------------------------------------------------------------------- -r525 | lh3 | 2008-09-16 17:56:11 -0400 (Tue, 16 Sep 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtsw2.h - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/bwtsw2_main.c - M /branches/prog/bwa/khash.h - M /branches/prog/bwa/main.c - - * bwa-0.2.0-11 - * improved cut_tail() - ------------------------------------------------------------------------- -r524 | lh3 | 2008-09-15 16:53:22 -0400 (Mon, 15 Sep 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/bwtsw2_main.c - M /branches/prog/bwa/main.c - - * bwa-0.2.0-10 - * fixed a bug in cut_tail() - ------------------------------------------------------------------------- -r518 | lh3 | 2008-09-15 04:35:59 -0400 (Mon, 15 Sep 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_core.c - -a bit code clean up - ------------------------------------------------------------------------- -r517 | lh3 | 2008-09-14 18:18:11 -0400 (Sun, 14 Sep 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_core.c - -improve speed (<1%) - ------------------------------------------------------------------------- -r516 | lh3 | 2008-09-14 18:08:55 -0400 (Sun, 14 Sep 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_core.c - - * fixed two potential bugs, although I have not seen their effects - * improve speed a bit (<2%) - ------------------------------------------------------------------------- -r515 | lh3 | 2008-09-14 17:26:49 -0400 (Sun, 14 Sep 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/bwtsw2_main.c - -nothing, really - ------------------------------------------------------------------------- -r514 | lh3 | 2008-09-14 17:10:13 -0400 (Sun, 14 Sep 2008) | 2 lines -Changed paths: - M 
/branches/prog/bwa/bwtsw2_core.c - -disable X-drop, which has to be reimplemented in the current algorithm - ------------------------------------------------------------------------- -r513 | lh3 | 2008-09-14 16:49:42 -0400 (Sun, 14 Sep 2008) | 4 lines -Changed paths: - M /branches/prog/bwa/bwt_lite.c - M /branches/prog/bwa/bwt_lite.h - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/bwtsw2_main.c - - * temporarily disable cut_tail() - * calculate SA in bwt_lite.c - * fixed a bug in reversing the sequence - ------------------------------------------------------------------------- -r512 | lh3 | 2008-09-13 17:35:40 -0400 (Sat, 13 Sep 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bwtsw2.h - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/bwtsw2_main.c - A /branches/prog/bwa/ksort.h - -n-best method - ------------------------------------------------------------------------- -r507 | lh3 | 2008-09-13 09:06:54 -0400 (Sat, 13 Sep 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/Makefile - M /branches/prog/bwa/bwtsw2_core.c - -give correct result again - ------------------------------------------------------------------------- -r506 | lh3 | 2008-09-13 08:12:07 -0400 (Sat, 13 Sep 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_core.c - -I think I know the reason. It needs more work... 
- ------------------------------------------------------------------------- -r505 | lh3 | 2008-09-13 06:20:43 -0400 (Sat, 13 Sep 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/Makefile - M /branches/prog/bwa/bwtsw2_core.c - -fixed another bug, but still have - ------------------------------------------------------------------------- -r504 | lh3 | 2008-09-12 18:13:37 -0400 (Fri, 12 Sep 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_core.c - -fixed another bug - ------------------------------------------------------------------------- -r503 | lh3 | 2008-09-12 17:15:56 -0400 (Fri, 12 Sep 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/khash.h - - * do not segfault, but the result is WRONG! - * prepare to remove bsw2_connectivity_check() - ------------------------------------------------------------------------- -r502 | lh3 | 2008-09-12 15:52:41 -0400 (Fri, 12 Sep 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/kvec.h - -more revisions - ------------------------------------------------------------------------- -r501 | lh3 | 2008-09-11 18:06:15 -0400 (Thu, 11 Sep 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_core.c - -further simply codes with kvec.h - ------------------------------------------------------------------------- -r500 | lh3 | 2008-09-11 17:42:15 -0400 (Thu, 11 Sep 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_core.c - -part of revisions... 
have not finished - ------------------------------------------------------------------------- -r499 | lh3 | 2008-09-11 17:24:15 -0400 (Thu, 11 Sep 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bwtsw2.h - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/bwtsw2_main.c - M /branches/prog/bwa/khash.h - A /branches/prog/bwa/kvec.h - -prepare for abrupt change - ------------------------------------------------------------------------- -r496 | lh3 | 2008-09-11 10:34:38 -0400 (Thu, 11 Sep 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_core.c - -fixed a bug; now "bwtsw2 -d" is useless - ------------------------------------------------------------------------- -r495 | lh3 | 2008-09-11 09:22:03 -0400 (Thu, 11 Sep 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/simple_dp.c - M /branches/prog/bwa/stdaln.c - M /branches/prog/bwa/stdaln.h - -improve speed a little bit - ------------------------------------------------------------------------- -r494 | lh3 | 2008-09-11 08:28:08 -0400 (Thu, 11 Sep 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_core.c - -remove debug codes - ------------------------------------------------------------------------- -r493 | lh3 | 2008-09-11 07:49:53 -0400 (Thu, 11 Sep 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_core.c - - * improve the speed a little bit (<5%) - * prepare to remove BSW_DEBUG - ------------------------------------------------------------------------- -r492 | lh3 | 2008-09-11 06:15:56 -0400 (Thu, 11 Sep 2008) | 4 lines -Changed paths: - M /branches/prog/bwa/bwtsw2.h - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/bwtsw2_main.c - M /branches/prog/bwa/main.c - - * bwa-0.2.0-9 - * support reverse strand - * fixed a bug that causes missing hits - ------------------------------------------------------------------------- -r491 | lh3 | 2008-09-11 05:46:16 -0400 (Thu, 11 Sep 2008) | 3 lines 
-Changed paths: - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/main.c - - * bwa-0.2.0-8 - * better progress report - ------------------------------------------------------------------------- -r490 | lh3 | 2008-09-10 17:04:49 -0400 (Wed, 10 Sep 2008) | 4 lines -Changed paths: - M /branches/prog/bwa/bwtsw2.h - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/bwtsw2_main.c - M /branches/prog/bwa/main.c - - * bwa-0.2.0-7 - * avoid some missing hits - * add maximum depth - ------------------------------------------------------------------------- -r489 | lh3 | 2008-09-10 11:51:13 -0400 (Wed, 10 Sep 2008) | 4 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/main.c - - * bwa-0.2.0-6 - * bwtsw2 works although on the forward strand only for now - * better progress information - ------------------------------------------------------------------------- -r488 | lh3 | 2008-09-10 10:21:53 -0400 (Wed, 10 Sep 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_core.c - - * implement memory pool - * avoid some rehashing - ------------------------------------------------------------------------- -r487 | lh3 | 2008-09-10 09:23:38 -0400 (Wed, 10 Sep 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtsw2.h - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_main.c - - * fixed a memory leak - * prepare to implement mempool - ------------------------------------------------------------------------- -r486 | lh3 | 2008-09-10 09:10:09 -0400 (Wed, 10 Sep 2008) | 4 lines -Changed paths: - M /branches/prog/bwa/bwtsw2.h - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/bwtsw2_main.c - M /branches/prog/bwa/khash.h - - * add X-dropoff - * remove duplicated results - * switch to simple stack - ------------------------------------------------------------------------- -r485 | lh3 | 2008-09-10 06:31:20 -0400 (Wed, 10 Sep 2008) | 3 lines 
-Changed paths: - M /branches/prog/bwa/bwtsw2.h - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/bwtsw2_main.c - - * check whether t-node has been visited - * prepare to remove two-level stack - ------------------------------------------------------------------------- -r484 | lh3 | 2008-09-10 05:00:57 -0400 (Wed, 10 Sep 2008) | 2 lines -Changed paths: - A /branches/prog/bwa/khash.h - -khash library - ------------------------------------------------------------------------- -r483 | lh3 | 2008-09-10 04:22:53 -0400 (Wed, 10 Sep 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_core.c - -add inline - ------------------------------------------------------------------------- -r482 | lh3 | 2008-09-09 16:34:57 -0400 (Tue, 09 Sep 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/Makefile - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_core.c - -improve speed - ------------------------------------------------------------------------- -r481 | lh3 | 2008-09-09 13:13:00 -0400 (Tue, 09 Sep 2008) | 4 lines -Changed paths: - M /branches/prog/bwa/bwtsw2_core.c - -Use a 128bit hash table to keep all (tk,tl,qk,ql). This is slow. Just -keep a copy in case I may need this in future. 
- - ------------------------------------------------------------------------- -r480 | lh3 | 2008-09-09 12:53:32 -0400 (Tue, 09 Sep 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bwtsw2.h - M /branches/prog/bwa/bwtsw2_core.c - - * no principal modification - ------------------------------------------------------------------------- -r479 | lh3 | 2008-09-09 11:01:45 -0400 (Tue, 09 Sep 2008) | 4 lines -Changed paths: - M /branches/prog/bwa/Makefile - M /branches/prog/bwa/bwtsw2_core.c - - * fixed a bug which may cause duplicated matching - * accelerate the speed a bit, although using hash in avoiding duplications - slows the speed down in the end - ------------------------------------------------------------------------- -r474 | lh3 | 2008-09-03 17:22:57 -0400 (Wed, 03 Sep 2008) | 4 lines -Changed paths: - M /branches/prog/bwa/Makefile - M /branches/prog/bwa/bwtsw2.h - M /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/bwtsw2_main.c - M /branches/prog/bwa/main.c - - * bwa-0.2.0-5 - * indel seems to work on toy example - * add band - ------------------------------------------------------------------------- -r469 | lh3 | 2008-09-01 09:18:45 -0400 (Mon, 01 Sep 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/ChangeLog - M /branches/prog/bwa/Makefile - M /branches/prog/bwa/bwt_lite.c - M /branches/prog/bwa/bwt_lite.h - M /branches/prog/bwa/bwtgap.c - M /branches/prog/bwa/bwtsw2.h - A /branches/prog/bwa/bwtsw2_aux.c - M /branches/prog/bwa/bwtsw2_core.c - M /branches/prog/bwa/bwtsw2_main.c - M /branches/prog/bwa/is.c - M /branches/prog/bwa/main.c - M /branches/prog/bwa/main.h - M /branches/prog/bwa/simple_dp.c - - * bwa-0.2.0-4 - * updated bwtsw2, which seems to work properly on toy examples - ------------------------------------------------------------------------- -r447 | lh3 | 2008-08-27 10:05:09 -0400 (Wed, 27 Aug 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/bwt2fmv.c - M /branches/prog/bwa/bwtaln.c - M 
/branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/bwtgap.c - M /branches/prog/bwa/main.c - - * bwa-0.2.0-3 - * tune for longer gaps, but it does not really work with kilo-bp gaps... - ------------------------------------------------------------------------- -r446 | lh3 | 2008-08-26 13:30:41 -0400 (Tue, 26 Aug 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/bwt2fmv.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/bwtgap.c - M /branches/prog/bwa/main.c - - * bwa-0.2.0-2 - * changed the way to extend long deletions. Now use max_del_occ. - ------------------------------------------------------------------------- -r445 | lh3 | 2008-08-26 13:05:58 -0400 (Tue, 26 Aug 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bwt_lite.c - M /branches/prog/bwa/bwt_lite.h - -updated from bwtsw2_lite - ------------------------------------------------------------------------- -r436 | lh3 | 2008-08-23 12:28:44 -0400 (Sat, 23 Aug 2008) | 4 lines -Changed paths: - M /branches/prog/bwa/Makefile - M /branches/prog/bwa/bwt.h - A /branches/prog/bwa/bwt_lite.c - A /branches/prog/bwa/bwt_lite.h - A /branches/prog/bwa/bwtsw2.h - A /branches/prog/bwa/bwtsw2_core.c - A /branches/prog/bwa/bwtsw2_main.c - M /branches/prog/bwa/main.c - - * bwa-0.2.0-1 - * add bwt_lite: a light-weighted version of bwt (NOT TESTED!) - * add core codes for bwtsw2: NOT TESTED!!! 
- ------------------------------------------------------------------------- -r427 | lh3 | 2008-08-15 05:38:12 -0400 (Fri, 15 Aug 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/ChangeLog - M /branches/prog/bwa/NEWS - M /branches/prog/bwa/bwa.1 - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/main.c - -Release bwa-0.2.0 - ------------------------------------------------------------------------- -r426 | lh3 | 2008-08-14 11:26:19 -0400 (Thu, 14 Aug 2008) | 4 lines -Changed paths: - M /branches/prog/bwa/bwt2fmv.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/main.c - - * bwa-0.1.6-7 - * change default seed length to 31 - * add incomplete support to color sequences (not tested yet!) - ------------------------------------------------------------------------- -r425 | lh3 | 2008-08-14 06:23:11 -0400 (Thu, 14 Aug 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/main.c - - * bwa-0.1.6-6 - * change default seed length to 33bp - ------------------------------------------------------------------------- -r424 | lh3 | 2008-08-14 05:55:33 -0400 (Thu, 14 Aug 2008) | 6 lines -Changed paths: - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtgap.c - M /branches/prog/bwa/main.c - - * bwa-0.1.6-5 - * fixed a bug that may miss true alignments. this bugs exists in most - early versions. - * fixed a bug that yields wrong coordinates for reads mapped on the forward - strands with gaps. - ------------------------------------------------------------------------- -r423 | lh3 | 2008-08-14 04:07:28 -0400 (Thu, 14 Aug 2008) | 2 lines -Changed paths: - D /branches/prog/bwa/Makefile.div - -useless - ------------------------------------------------------------------------- -r422 | lh3 | 2008-08-13 19:21:14 -0400 (Wed, 13 Aug 2008) | 4 lines -Changed paths: - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/main.c - - * bwa-0.1.6-4 - * fixed one bug - * there is another one... 
- ------------------------------------------------------------------------- -r421 | lh3 | 2008-08-13 18:23:33 -0400 (Wed, 13 Aug 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/bwtgap.c - M /branches/prog/bwa/bwtgap.h - M /branches/prog/bwa/bwtindex.c - M /branches/prog/bwa/main.c - - * bwa-0.1.6-3 - * almost there, but not quite right - ------------------------------------------------------------------------- -r419 | lh3 | 2008-08-13 17:27:02 -0400 (Wed, 13 Aug 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/bwtgap.c - M /branches/prog/bwa/bwtgap.h - M /branches/prog/bwa/main.c - - * improve the seeding method - * prepare to load two BWTs into memory. A BIG change! - ------------------------------------------------------------------------- -r418 | lh3 | 2008-08-13 10:56:54 -0400 (Wed, 13 Aug 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/bwt2fmv.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/bwtgap.c - M /branches/prog/bwa/bwtgap.h - M /branches/prog/bwa/main.c - - * added seeding - * unfinished yet - ------------------------------------------------------------------------- -r413 | lh3 | 2008-08-08 11:48:35 -0400 (Fri, 08 Aug 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/ChangeLog - M /branches/prog/bwa/NEWS - M /branches/prog/bwa/main.c - -Release bwa-0.1.6 - ------------------------------------------------------------------------- -r410 | lh3 | 2008-08-06 15:48:22 -0400 (Wed, 06 Aug 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/simple_dp.c - -sw: output alignment score - ------------------------------------------------------------------------- -r407 | lh3 | 2008-08-04 10:01:20 -0400 (Mon, 04 Aug 2008) | 4 lines -Changed paths: - M /branches/prog/bwa/Makefile - M /branches/prog/bwa/main.c - M /branches/prog/bwa/main.h - A /branches/prog/bwa/simple_dp.c - M 
/branches/prog/bwa/stdaln.c - M /branches/prog/bwa/stdaln.h - - * bwa-0.1.5-3 - * added a simple interface to SW/NW alignment - * stdaln-0.9.8 (see header for more details) - ------------------------------------------------------------------------- -r406 | lh3 | 2008-08-01 19:21:59 -0400 (Fri, 01 Aug 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/Makefile - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/main.c - A /branches/prog/bwa/stdaln.c - A /branches/prog/bwa/stdaln.h - - * bwa-0.1.5-2 - * give accurate gap positions - ------------------------------------------------------------------------- -r405 | lh3 | 2008-08-01 19:06:19 -0400 (Fri, 01 Aug 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/Makefile - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - -unfinished, but I am tired... - ------------------------------------------------------------------------- -r401 | lh3 | 2008-07-30 05:59:24 -0400 (Wed, 30 Jul 2008) | 4 lines -Changed paths: - M /branches/prog/bwa/bntseq.c - M /branches/prog/bwa/main.c - - * bwa-0.1.5-1 - * fixed a potential bug which may produce an alignment in N regions, - although extremely rare. 
- ------------------------------------------------------------------------- -r399 | lh3 | 2008-07-27 11:41:52 -0400 (Sun, 27 Jul 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/ChangeLog - M /branches/prog/bwa/NEWS - M /branches/prog/bwa/main.c - -Release bwa-0.1.5 - ------------------------------------------------------------------------- -r398 | lh3 | 2008-07-25 12:14:47 -0400 (Fri, 25 Jul 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bwa.1 - -update documentation - ------------------------------------------------------------------------- -r397 | lh3 | 2008-07-25 09:58:56 -0400 (Fri, 25 Jul 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bwt2fmv.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/main.c - - * - ------------------------------------------------------------------------- -r396 | lh3 | 2008-07-25 06:42:01 -0400 (Fri, 25 Jul 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/main.c - - * bwa-0.1.4-4 - * add timer for debugging - ------------------------------------------------------------------------- -r395 | lh3 | 2008-07-24 05:46:21 -0400 (Thu, 24 Jul 2008) | 4 lines -Changed paths: - M /branches/prog/bwa/bwtgap.c - M /branches/prog/bwa/main.c - - * bwa-0.1.4-3 - * fixed a bug in the previous code - * this version gives identical result to bwa-0.1.4, just 10% faster - ------------------------------------------------------------------------- -r394 | lh3 | 2008-07-24 05:18:53 -0400 (Thu, 24 Jul 2008) | 4 lines -Changed paths: - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtgap.c - M /branches/prog/bwa/bwtgap.h - M /branches/prog/bwa/main.c - - * bwa-0.1.4-2 - * further improve the speed - * The result is slightly different from bwa-0.1.4 now. I need to check... 
- ------------------------------------------------------------------------- -r393 | lh3 | 2008-07-23 12:04:16 -0400 (Wed, 23 Jul 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bwt.c - -comments only - ------------------------------------------------------------------------- -r392 | lh3 | 2008-07-23 10:34:03 -0400 (Wed, 23 Jul 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/main.c - -further improve the speed in Occ functions - ------------------------------------------------------------------------- -r386 | lh3 | 2008-07-22 10:03:54 -0400 (Tue, 22 Jul 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/NEWS - M /branches/prog/bwa/main.c - -Release bwa-0.1.4 - ------------------------------------------------------------------------- -r385 | lh3 | 2008-07-22 09:44:50 -0400 (Tue, 22 Jul 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/ChangeLog - M /branches/prog/bwa/bwa.1 - -update documentation and ChangeLog - ------------------------------------------------------------------------- -r384 | lh3 | 2008-07-22 08:50:03 -0400 (Tue, 22 Jul 2008) | 4 lines -Changed paths: - M /branches/prog/bwa/Makefile - M /branches/prog/bwa/bwt2fmv.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/bwtgap.c - M /branches/prog/bwa/main.c - - * bwa-0.1.3-2 - * fixed the bug in the last modification - * now the alignment should be more clearly defined - ------------------------------------------------------------------------- -r383 | lh3 | 2008-07-21 18:32:21 -0400 (Mon, 21 Jul 2008) | 4 lines -Changed paths: - M /branches/prog/bwa/bwt2fmv.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtgap.c - M /branches/prog/bwa/main.c - - * bwa-0.1.3-1 - * this is a buggy version! - * i will fix the bug tomorrow. It is late...
- ------------------------------------------------------------------------- -r381 | lh3 | 2008-07-21 06:45:32 -0400 (Mon, 21 Jul 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/ChangeLog - M /branches/prog/bwa/NEWS - M /branches/prog/bwa/bwa.1 - M /branches/prog/bwa/main.c - -Release bwa-0.1.3 - ------------------------------------------------------------------------- -r380 | lh3 | 2008-07-21 06:07:43 -0400 (Mon, 21 Jul 2008) | 4 lines -Changed paths: - M /branches/prog/bwa/ChangeLog - M /branches/prog/bwa/bwa.1 - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/bwt2fmv.c - M /branches/prog/bwa/main.c - - * bwa-0.1.2-3 - * improve the speed for gcc on Intel Mac OS X, but not really on icc on Linux - * aln: more command-line options - ------------------------------------------------------------------------- -r373 | lh3 | 2008-07-17 09:09:46 -0400 (Thu, 17 Jul 2008) | 4 lines -Changed paths: - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/bwt.h - M /branches/prog/bwa/bwtio.c - M /branches/prog/bwa/main.c - - * bwa-0.1.2-2 - * further improve the speed - * this version gives exactly the same result as bwa-0.1.2 - ------------------------------------------------------------------------- -r372 | lh3 | 2008-07-17 07:51:08 -0400 (Thu, 17 Jul 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/main.c - - * bwa-0.1.2-1 - * speed up by about 5% - ------------------------------------------------------------------------- -r370 | lh3 | 2008-07-17 05:12:00 -0400 (Thu, 17 Jul 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/NEWS - M /branches/prog/bwa/bwa.1 - M /branches/prog/bwa/main.c - -Release bwa-0.1.2 - ------------------------------------------------------------------------- -r368 | lh3 | 2008-07-16 08:51:25 -0400 (Wed, 16 Jul 2008) | 4 lines -Changed paths: - M /branches/prog/bwa/Makefile - D /branches/prog/bwa/bwt1away.c - M /branches/prog/bwa/bwt2fmv.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - M 
/branches/prog/bwa/bwtgap.c - M /branches/prog/bwa/bwtgap.h - D /branches/prog/bwa/bwttop2.c - M /branches/prog/bwa/main.c - - * bwa-0.1.1-9 - * some code cleanup - * remove 1away and top2 - ------------------------------------------------------------------------- -r367 | lh3 | 2008-07-16 08:24:34 -0400 (Wed, 16 Jul 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/is.c - -Yuta Mori's implementation of IS algorithm. - ------------------------------------------------------------------------- -r365 | lh3 | 2008-07-16 06:58:04 -0400 (Wed, 16 Jul 2008) | 6 lines -Changed paths: - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtgap.c - M /branches/prog/bwa/bwtgap.h - M /branches/prog/bwa/main.c - - * bwa-0.1.1-8 - * improve gapped alignment - * this version will miss more gapped alignments, but the speed is much faster - * prepare to remove top2 and 1away algorithms - * prepare to add SAIS algorithm for bwt construction - ------------------------------------------------------------------------- -r358 | lh3 | 2008-06-09 06:03:04 -0400 (Mon, 09 Jun 2008) | 4 lines -Changed paths: - M /branches/prog/bwa/bwt2fmv.c - M /branches/prog/bwa/bwtgap.c - M /branches/prog/bwa/main.c - - * bwa-0.1.1-7 - * change END_SKIP from 3 to 5, but still gaps may be wrongly added - * change default '-g' from 5 to 3 - ------------------------------------------------------------------------- -r357 | lh3 | 2008-06-09 05:18:36 -0400 (Mon, 09 Jun 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/bntseq.c - M /branches/prog/bwa/bwt2fmv.c - M /branches/prog/bwa/bwtgap.c - M /branches/prog/bwa/main.c - - * bwa-0.1.1-6 - * fix a bug in nested stack - ------------------------------------------------------------------------- -r356 | lh3 | 2008-06-08 18:43:13 -0400 (Sun, 08 Jun 2008) | 4 lines -Changed paths: - M /branches/prog/bwa/Makefile - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/bwtgap.c - A /branches/prog/bwa/bwtgap.h - M 
/branches/prog/bwa/main.c - - * bwa-0.1.1-5 - * replace heap with nested stacks - * there are still obvious bugs... - ------------------------------------------------------------------------- -r355 | lh3 | 2008-06-08 17:13:44 -0400 (Sun, 08 Jun 2008) | 4 lines -Changed paths: - M /branches/prog/bwa/bwt2fmv.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/bwtgap.c - M /branches/prog/bwa/main.c - M /branches/prog/bwa/main.h - - * bwa-0.1.1-4 - * add interface to affine gap alignment - * there are obvious bugs and I will fix them later - ------------------------------------------------------------------------- -r354 | lh3 | 2008-06-08 15:39:05 -0400 (Sun, 08 Jun 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/bwtgap.c - M /branches/prog/bwa/main.c - - * bwa-0.1.1-3 - * affine gap seems to work, at least partially - ------------------------------------------------------------------------- -r353 | lh3 | 2008-06-08 09:27:18 -0400 (Sun, 08 Jun 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/Makefile - M /branches/prog/bwa/bwt2fmv.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - A /branches/prog/bwa/bwtgap.c - M /branches/prog/bwa/bwttop2.c - M /branches/prog/bwa/main.c - - * bwa-0.1.1-2 - * initial gapped alignment. 
not work at the moment - ------------------------------------------------------------------------- -r352 | lh3 | 2008-06-06 04:37:34 -0400 (Fri, 06 Jun 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/bwttop2.c - M /branches/prog/bwa/main.c - - * bwa-0.1.1-1 - * ungap: remove a useless variable in top2_entry_t - ------------------------------------------------------------------------- -r348 | lh3 | 2008-06-03 09:04:12 -0400 (Tue, 03 Jun 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/ChangeLog - A /branches/prog/bwa/NEWS - M /branches/prog/bwa/bwa.1 - M /branches/prog/bwa/main.c - -Release bwa-0.1.1 - ------------------------------------------------------------------------- -r347 | lh3 | 2008-06-03 05:45:08 -0400 (Tue, 03 Jun 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bwa.1 - -update documentation - ------------------------------------------------------------------------- -r346 | lh3 | 2008-06-02 18:59:50 -0400 (Mon, 02 Jun 2008) | 5 lines -Changed paths: - A /branches/prog/bwa/ChangeLog - A /branches/prog/bwa/bwa.1 - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/main.c - - * bwa-0.1.0-11 - * improve approximating mapping qualities - * add documentation - * add ChangeLog - ------------------------------------------------------------------------- -r345 | lh3 | 2008-06-02 16:04:39 -0400 (Mon, 02 Jun 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwttop2.c - M /branches/prog/bwa/main.c - - * bwa-0.1.0-10 - * output a random position for repetitive reads - ------------------------------------------------------------------------- -r344 | lh3 | 2008-06-02 15:03:54 -0400 (Mon, 02 Jun 2008) | 4 lines -Changed paths: - M /branches/prog/bwa/bntseq.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/main.c - M /branches/prog/bwa/pac2bwt.c - - * bwa-0.1.0-9 - * fix memory leaks - * fix a potential bug in converting to the real coordinate - 
------------------------------------------------------------------------- -r343 | lh3 | 2008-06-02 13:44:51 -0400 (Mon, 02 Jun 2008) | 5 lines -Changed paths: - M /branches/prog/bwa/Makefile.div - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/bwt.h - M /branches/prog/bwa/bwt2fmv.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwttop2.c - M /branches/prog/bwa/main.c - - * bwa-0.1.0-8 - * fix a bug about strand - * update Makefile.div - * change top2b as the default method - ------------------------------------------------------------------------- -r342 | lh3 | 2008-06-02 11:23:26 -0400 (Mon, 02 Jun 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/bwt1away.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/main.c - - * bwa-0.1.0-7 - * use bwt_2occ() and bwt_2occ4() in other functions - ------------------------------------------------------------------------- -r341 | lh3 | 2008-06-02 09:31:39 -0400 (Mon, 02 Jun 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/bwttop2.c - M /branches/prog/bwa/main.c - - * bwa-0.1.0-6 - * fix a bug for missing hits - ------------------------------------------------------------------------- -r340 | lh3 | 2008-06-02 09:10:18 -0400 (Mon, 02 Jun 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/bwttop2.c - M /branches/prog/bwa/main.c - - * bwa-0.1.0-5 - * accelerate comparisons in heap, a bit - ------------------------------------------------------------------------- -r339 | lh3 | 2008-06-02 08:41:31 -0400 (Mon, 02 Jun 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/bwt.h - M /branches/prog/bwa/bwttop2.c - M /branches/prog/bwa/main.c - - * bwa-0.1.0-4 - * avoid marginal repeated calculation in occ - ------------------------------------------------------------------------- -r338 | lh3 | 2008-06-02 06:46:51 -0400 (Mon, 02 Jun 2008) | 5 lines -Changed paths: - M /branches/prog/bwa/Makefile - M /branches/prog/bwa/bwt.c - M 
/branches/prog/bwa/bwttop2.c - M /branches/prog/bwa/main.c - - * bwa-0.1.0-3 - * fix a bug caused by previous change - * fix a bug in heap - * order the heap by more criteria - ------------------------------------------------------------------------- -r337 | lh3 | 2008-06-01 19:11:15 -0400 (Sun, 01 Jun 2008) | 4 lines -Changed paths: - M /branches/prog/bwa/bwttop2.c - M /branches/prog/bwa/main.c - - * bwa-0.1.0-2 - * also sort sa range in heapsort, in attempt to improve cache performance. - Unfortunately, it does not work well at all. - ------------------------------------------------------------------------- -r336 | lh3 | 2008-06-01 17:45:23 -0400 (Sun, 01 Jun 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/Makefile - M /branches/prog/bwa/Makefile.div - M /branches/prog/bwa/bntseq.c - M /branches/prog/bwa/main.c - - * 0.1.0-1 - * fix a bug in calculating the real coordinate - ------------------------------------------------------------------------- -r335 | lh3 | 2008-06-01 16:03:09 -0400 (Sun, 01 Jun 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/Makefile - -nothing, really - ------------------------------------------------------------------------- -r334 | lh3 | 2008-06-01 15:59:13 -0400 (Sun, 01 Jun 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/Makefile - A /branches/prog/bwa/Makefile.div - M /branches/prog/bwa/bwtindex.c - M /branches/prog/bwa/pac2bwt.c - -use IS algorithm by default - ------------------------------------------------------------------------- -r333 | lh3 | 2008-06-01 15:05:15 -0400 (Sun, 01 Jun 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/Makefile - M /branches/prog/bwa/bwtindex.c - M /branches/prog/bwa/is.c - M /branches/prog/bwa/pac2bwt.c - - * a bit code clean up in is.c - * add IS algorithm for constructing BWT, albeit slower - ------------------------------------------------------------------------- -r332 | lh3 | 2008-06-01 13:23:08 -0400 (Sun, 01 Jun 2008) | 2 lines -Changed paths: - A 
/branches/prog/bwa/is.c - -IS linear-time algorithm for constructing SA/BWT - ------------------------------------------------------------------------- -r331 | lh3 | 2008-06-01 10:35:26 -0400 (Sun, 01 Jun 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/Makefile - M /branches/prog/bwa/bntseq.c - A /branches/prog/bwa/bwtindex.c - M /branches/prog/bwa/main.c - M /branches/prog/bwa/main.h - - * fix a bug in generating .pac - * index in one go - ------------------------------------------------------------------------- -r330 | lh3 | 2008-06-01 09:17:05 -0400 (Sun, 01 Jun 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bntseq.c - M /branches/prog/bwa/bntseq.h - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwttop2.c - -real coordinates can be output - ------------------------------------------------------------------------- -r329 | lh3 | 2008-05-31 19:21:02 -0400 (Sat, 31 May 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bwt2fmv.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/bwttop2.c - -add top2e which is similar to 1away - ------------------------------------------------------------------------- -r328 | lh3 | 2008-05-31 18:46:12 -0400 (Sat, 31 May 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/bwt2fmv.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/bwttop2.c - M /branches/prog/bwa/main.c - M /branches/prog/bwa/main.h - - * unified cmd-line interface for ungapped alignment - * add two alternatives to top2 algorithm - ------------------------------------------------------------------------- -r327 | lh3 | 2008-05-31 18:14:46 -0400 (Sat, 31 May 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bwt2fmv.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/main.c - M /branches/prog/bwa/main.h - -add cmd-line interface to alntop2 - ------------------------------------------------------------------------- -r326 | lh3 | 2008-05-31 17:59:31 -0400 (Sat, 
31 May 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/Makefile - M /branches/prog/bwa/bwt1away.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - A /branches/prog/bwa/bwttop2.c - -top2 algorithm seems to work. I need to change interface, though - ------------------------------------------------------------------------- -r325 | lh3 | 2008-05-31 15:11:49 -0400 (Sat, 31 May 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bwt1away.c - -change the variable in the structure - ------------------------------------------------------------------------- -r324 | lh3 | 2008-05-31 14:52:13 -0400 (Sat, 31 May 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bwt1away.c - -set a slightly better bound on the maximum allowed mismatches - ------------------------------------------------------------------------- -r323 | lh3 | 2008-05-30 18:40:21 -0400 (Fri, 30 May 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bwtaln.c - - * output time statistics - ------------------------------------------------------------------------- -r322 | lh3 | 2008-05-30 17:58:25 -0400 (Fri, 30 May 2008) | 4 lines -Changed paths: - M /branches/prog/bwa/Makefile - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/bwt.h - A /branches/prog/bwa/bwt1away.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - - * presumably better way to make use of prefix. But for the moment I do - not know whether it is correct or not. 
- * a bit code clean up: separate alignment part - ------------------------------------------------------------------------- -r321 | lh3 | 2008-05-30 13:57:43 -0400 (Fri, 30 May 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/Makefile - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/bwt.h - M /branches/prog/bwa/bwt_gen/Makefile - M /branches/prog/bwa/bwt_gen/bwt_gen.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/main.c - M /branches/prog/bwa/main.h - M /branches/prog/bwa/pac2bwt.c - - * a bit code clean up - * put bwt_gen in bwa - ------------------------------------------------------------------------- -r320 | lh3 | 2008-05-30 11:40:11 -0400 (Fri, 30 May 2008) | 4 lines -Changed paths: - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/bwt2fmv.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/bwtio.c - - * improve cmd-line interface - * fix a bug in loading .sa - * change default sa interval to 32 - ------------------------------------------------------------------------- -r319 | lh3 | 2008-05-30 10:31:37 -0400 (Fri, 30 May 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/bwtaln.c - - * fix memory leak (I know that. 
Just a bit lazy) - * change to another method to do 1-away alignment - ------------------------------------------------------------------------- -r318 | lh3 | 2008-05-30 09:21:49 -0400 (Fri, 30 May 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/bwt.h - M /branches/prog/bwa/bwt2fmv.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/main.c - M /branches/prog/bwa/main.h - -best unique match is partially finished - ------------------------------------------------------------------------- -r317 | lh3 | 2008-05-30 06:33:28 -0400 (Fri, 30 May 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/bwt2fmv.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/main.c - M /branches/prog/bwa/main.h - -remove "ungapped" command and related codes - ------------------------------------------------------------------------- -r316 | lh3 | 2008-05-30 06:05:20 -0400 (Fri, 30 May 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/Makefile - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - -change variable name thick to width - ------------------------------------------------------------------------- -r315 | lh3 | 2008-05-29 19:06:13 -0400 (Thu, 29 May 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bntseq.c - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/bwt.h - M /branches/prog/bwa/bwt2fmv.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/bwtio.c - M /branches/prog/bwa/main.c - M /branches/prog/bwa/main.h - M /branches/prog/bwa/pac2bwt.c - -revised algorithm for ungapped alignment. the old one can still be used. 
- ------------------------------------------------------------------------- -r314 | lh3 | 2008-05-29 16:36:11 -0400 (Thu, 29 May 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bwt2fmv.c - M /branches/prog/bwa/bwt_gen/bwt_gen.c - M /branches/prog/bwa/bwtio.c - M /branches/prog/bwa/pac2bwt.c - - * make commands more independent, but ungapped does not work at the moment - ------------------------------------------------------------------------- -r313 | lh3 | 2008-05-29 15:56:14 -0400 (Thu, 29 May 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bwt_gen/bwt_gen.c - -little... - ------------------------------------------------------------------------- -r312 | lh3 | 2008-05-29 15:54:01 -0400 (Thu, 29 May 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/bwt_gen/bwt_gen.c - M /branches/prog/bwa/bwt_gen/bwt_gen.h - - * add CopyRight information from the original codes - * do not dump .fmv files - ------------------------------------------------------------------------- -r311 | lh3 | 2008-05-29 15:44:36 -0400 (Thu, 29 May 2008) | 2 lines -Changed paths: - A /branches/prog/bwa/bwt_gen - A /branches/prog/bwa/bwt_gen/Makefile - A /branches/prog/bwa/bwt_gen/QSufSort.c - A /branches/prog/bwa/bwt_gen/QSufSort.h - A /branches/prog/bwa/bwt_gen/bwt_gen.c - A /branches/prog/bwa/bwt_gen/bwt_gen.h - -codes from BWT-SW, for building BWT from packed file - ------------------------------------------------------------------------- -r310 | lh3 | 2008-05-28 17:03:35 -0400 (Wed, 28 May 2008) | 4 lines -Changed paths: - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/bwt.h - M /branches/prog/bwa/bwt2fmv.c - M /branches/prog/bwa/bwtio.c - M /branches/prog/bwa/main.c - M /branches/prog/bwa/main.h - - * change OCC_INTERVAL to 0x40, which makes bwa twice as fast. 
- * write Occ file as ".occ" as it is using a different interval from - .fmv, the BWT-SW correspondance of .occ - ------------------------------------------------------------------------- -r309 | lh3 | 2008-05-28 11:39:37 -0400 (Wed, 28 May 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/bwt2fmv.c - -fix a bug - ------------------------------------------------------------------------- -r308 | lh3 | 2008-05-28 09:56:16 -0400 (Wed, 28 May 2008) | 4 lines -Changed paths: - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/bwt2fmv.c - -add heuristics to improve the speed, but I have not tested whether the -results are correct or not. - - ------------------------------------------------------------------------- -r307 | lh3 | 2008-05-28 06:31:34 -0400 (Wed, 28 May 2008) | 5 lines -Changed paths: - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/bwt2fmv.c - M /branches/prog/bwa/bwtaln.c - M /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/main.c - M /branches/prog/bwa/main.h - - * make ungapped alignment basically works... - * but it is very slow in comparison to others... - * also I need to improve the interface... - * a lot of things to keep me busy today... - ------------------------------------------------------------------------- -r306 | lh3 | 2008-05-27 18:41:27 -0400 (Tue, 27 May 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/bwt.h - M /branches/prog/bwa/bwtaln.c - - * remove recursion - * fixed a bug in bwt_occ() - ------------------------------------------------------------------------- -r305 | lh3 | 2008-05-27 16:59:44 -0400 (Tue, 27 May 2008) | 5 lines -Changed paths: - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/bwt.h - M /branches/prog/bwa/bwtaln.c - - * bwa now tells whether a sequenced can be mapped with maximum allowed - mismatches. ONLY ungapped. - * this is a recursive version. I will remove recursion later. 
- - ------------------------------------------------------------------------- -r304 | lh3 | 2008-05-27 09:12:17 -0400 (Tue, 27 May 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/Makefile - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/bwt.h - M /branches/prog/bwa/bwt2fmv.c - A /branches/prog/bwa/bwtaln.c - A /branches/prog/bwa/bwtaln.h - M /branches/prog/bwa/bwtio.c - M /branches/prog/bwa/main.c - M /branches/prog/bwa/main.h - M /branches/prog/bwa/utils.c - - * load .sa and .fmv files - * exact alignment now works - ------------------------------------------------------------------------- -r303 | lh3 | 2008-05-27 06:33:38 -0400 (Tue, 27 May 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/bntseq.c - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/bwtio.c - M /branches/prog/bwa/utils.c - M /branches/prog/bwa/utils.h - -add xassert and fix a bug - ------------------------------------------------------------------------- -r302 | lh3 | 2008-05-27 06:23:20 -0400 (Tue, 27 May 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/Makefile - M /branches/prog/bwa/bntseq.c - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/bwtio.c - A /branches/prog/bwa/utils.c - A /branches/prog/bwa/utils.h - -improve error message and error handling - ------------------------------------------------------------------------- -r301 | lh3 | 2008-05-27 05:37:51 -0400 (Tue, 27 May 2008) | 4 lines -Changed paths: - M /branches/prog/bwa/Makefile - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/bwt.h - M /branches/prog/bwa/bwt2fmv.c - A /branches/prog/bwa/bwtio.c - M /branches/prog/bwa/main.c - M /branches/prog/bwa/main.h - - * move I/O codes to bwtio.c - * SA can be dumped and interestingly, it is identical to BWTSW - * now, .fmv is still different from BWTSW - ------------------------------------------------------------------------- -r299 | lh3 | 2008-05-26 18:07:44 -0400 (Mon, 26 May 2008) | 2 lines -Changed paths: - M /branches/prog/bwa/Makefile - M 
/branches/prog/bwa/bwt.c - M /branches/prog/bwa/bwt.h - M /branches/prog/bwa/bwt2fmv.c - -generate/retrieve SA and Occ - ------------------------------------------------------------------------- -r298 | lh3 | 2008-05-26 13:16:49 -0400 (Mon, 26 May 2008) | 3 lines -Changed paths: - M /branches/prog/bwa/bntseq.h - M /branches/prog/bwa/bwt.c - M /branches/prog/bwa/bwt.h - M /branches/prog/bwa/bwt2fmv.c - - * retrieve occ value at any position - * move bwt_cal_occ() to bwt.c - ------------------------------------------------------------------------- -r297 | lh3 | 2008-05-25 17:43:58 -0400 (Sun, 25 May 2008) | 6 lines -Changed paths: - M /branches/prog/bwa/Makefile - A /branches/prog/bwa/bwt.c - A /branches/prog/bwa/bwt.h - A /branches/prog/bwa/bwt2fmv.c - M /branches/prog/bwa/main.c - M /branches/prog/bwa/main.h - M /branches/prog/bwa/pac2bwt.c - - * add bwt2fmv. It works to some extent. However, I do not understand - the purpose of some weird codes in BWT-SW. As a consequence, bwt2fmv - could generate a file almost identical, but not exactly identical, to - the .fmv file from BWT-SW. - - ------------------------------------------------------------------------- -r296 | lh3 | 2008-05-24 18:35:02 -0400 (Sat, 24 May 2008) | 5 lines -Changed paths: - M /branches/prog/bwa/Makefile - M /branches/prog/bwa/bntseq.c - M /branches/prog/bwa/bntseq.h - M /branches/prog/bwa/main.c - M /branches/prog/bwa/main.h - A /branches/prog/bwa/pac2bwt.c - -Burrows-Wheeler Transform now works. At least on one example, the -current code generates the same BWT as BWT-SW. Kind of magical, I would -say. 
:) - - ------------------------------------------------------------------------- -r295 | lh3 | 2008-05-24 11:25:31 -0400 (Sat, 24 May 2008) | 3 lines -Changed paths: - A /branches/prog/bwa/Makefile - M /branches/prog/bwa/bntseq.c - A /branches/prog/bwa/main.c - A /branches/prog/bwa/main.h - - * add Makefile and main.* - * improve interface to fa2bns, a bit - ------------------------------------------------------------------------- -r293 | lh3 | 2008-05-24 10:57:03 -0400 (Sat, 24 May 2008) | 3 lines -Changed paths: - A /branches/prog/bwa - A /branches/prog/bwa/bntseq.c - A /branches/prog/bwa/bntseq.h - A /branches/prog/bwa/seq.c - A /branches/prog/bwa/seq.h - - * Burrow-Wheeler Alignment - * initial codes - ------------------------------------------------------------------------- diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/NEWS.md --- a/bwa-0.7.9a/NEWS.md Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1055 +0,0 @@ -Release 0.7.9 (19 May, 2014) ----------------------------- - -This release brings several major changes to BWA-MEM. Notably, BWA-MEM now -formally supports PacBio read-to-reference alignment and experimentally supports -PacBio read-to-read alignment. BWA-MEM also runs faster at a minor cost of -accuracy. The speedup is more significant when GRCh38 is in use. More -specifically: - - * Support PacBio subread-to-reference alignment. Although older BWA-MEM works - with PacBio data in principle, the resultant alignments are frequently - fragmented. In this release, we fine tuned existing methods and introduced - new heuristics to improve PacBio alignment. These changes are not used by - default. Users need to add option "-x pacbio" to enable the feature. - - * Support PacBio subread-to-subread alignment (EXPERIMENTAL). This feature is - enabled with option "-x pbread". In this mode, the output only gives the - overlapping region between a pair of reads without detailed alignment. 
- - * Output alternative hits in the XA tag if there are not so many of them. This - is a BWA-backtrack feature. - - * Support mapping to ALT contigs in GRCh38 (EXPERIMENTAL). We provide a script - to postprocess hits in the XA tag to adjust the mapping quality and generate - new primary alignments to all overlapping ALT contigs. We would *NOT* - recommend this feature for production uses. - - * Improved alignments to many short reference sequences. Older BWA-MEM may - generate an alignment bridging two or more adjacent reference sequences. - Such alignments are split at a later step as postprocessing. This approach - is complex and does not always work. This release forbids these alignments - from the very beginning. BWA-MEM should not produce an alignment bridging - two or more reference sequences any more. - - * Reduced the maximum seed occurrence from 10000 to 500. Reduced the maximum - rounds of Smith-Waterman mate rescue from 100 to 50. Added a heuristic to - lower the mapping quality if a read contains seeds with excessive - occurrences. These changes make BWA-MEM faster at a minor cost of accuracy - in highly repetitive regions. - - * Added an option "-Y" to use soft clipping for supplementary alignments. - - * Bugfix: incomplete alignment extension in corner cases. - - * Bugfix: integer overflow when aligning long query sequences. - - * Bugfix: chain score is not computed correctly (almost no practical effect) - - * General code cleanup - - * Added FAQs to README - -Changes in BWA-backtrack: - - * Bugfix: a segmentation fault when an alignment stands out of the end of the - last chromosome. - -(0.7.9: 19 May 2014, r783) - - - -Release 0.7.8 (31 March, 2014) ------------------------------- - -Changes in BWA-MEM: - - * Bugfix: off-diagonal X-dropoff (option -d) not working as intended. - Short-read alignment is not affected. - - * Bugfix: unnecessarily large bandwidth used during global alignment, - which reduces the mapping speed by -5% for short reads. 
Results are not - affected. - - * Bugfix: when the matching score is not one, paired-end mapping quality is - inaccurate. - - * When the matching score (option -A) is changed, scale all score-related - options accordingly unless overridden by users. - - * Allow to specify different gap open (or extension) penalties for deletions - and insertions separately. - - * Allow to specify the insert size distribution. - - * Better and more detailed debugging information. - -With the default setting, 0.7.8 and 0.7.7 gave identical output on one million -100bp read pairs. - -(0.7.8: 31 March 2014, r455) - - - -Release 0.7.7 (25 Feburary, 2014) ---------------------------------- - -This release fixes incorrect MD tags in the BWA-MEM output. - -A note about short-read mapping to GRCh38. The new human reference genome -GRCh38 contains 60Mbp program generated alpha repeat arrays, some of which are -hard masked as they cannot be localized. These highly repetitive arrays make -BWA-MEM -50% slower. If you are concerned with the performance of BWA-MEM, you -may consider to use option "-c2000 -m50". On simulated data, this setting helps -the performance at a very minor cost on accuracy. I may consider to change the -default in future releases. - -(0.7.7: 25 Feburary 2014, r441) - - - -Release 0.7.6 (31 Januaray, 2014) ---------------------------------- - -Changes in BWA-MEM: - - * Changed the way mapping quality is estimated. The new method tends to give - the same alignment a higher mapping quality. On paired-end reads, the change - is minor as with pairing, the mapping quality is usually high. For short - single-end reads, the difference is considerable. - - * Improved load balance when many threads are spawned. However, bwa-mem is - still not very thread efficient, probably due to the frequent heap memory - allocation. Further improvement is a little difficult and may affect the - code stability. - - * Allow to use different clipping penalties for 5'- and 3'-ends. 
This helps - when we do not want to clip one end. - - * Print the @PG line, including the command line options. - - * Improved the band width estimate: a) fixed a bug causing the band - width extimated from extension not used in the final global alignment; b) - try doubled band width if the global alignment score is smaller. - Insufficient band width leads to wrong CIGAR and spurious mismatches/indels. - - * Added a new option -D to fine tune a heuristic on dropping suboptimal hits. - Reducing -D increases accuracy but decreases the mapping speed. If unsure, - leave it to the default. - - * Bugfix: for a repetitive single-end read, the reported hit is not randomly - distributed among equally best hits. - - * Bugfix: missing paired-end hits due to unsorted list of SE hits. - - * Bugfix: incorrect CIGAR caused by a defect in the global alignment. - - * Bugfix: incorrect CIGAR caused by failed SW rescue. - - * Bugfix: alignments largely mapped to the same position are regarded to be - distinct from each other, which leads to underestimated mapping quality. - - * Added the MD tag. - -There are no changes to BWA-backtrack in this release. However, it has a few -known issues yet to be fixed. If you prefer BWA-track, It is still advised to -use bwa-0.6.x. - -While I developed BWA-MEM, I also found a few issues with BWA-SW. It is now -possible to improve BWA-SW with the lessons learned from BWA-MEM. However, as -BWA-MEM is usually better, I will not improve BWA-SW until I find applications -where BWA-SW may excel. - -(0.7.6: 31 January 2014, r432) - - - -Release 0.7.5a (30 May, 2013) ------------------------------ - -Fixed a bug in BWA-backtrack which leads to off-by-one mapping errors in rare -cases. - -(0.7.5a: 30 May 2013, r405) - - - -Release 0.7.5 (29 May, 2013) ----------------------------- - -Changes in all components: - - * Improved error checking on memory allocation and file I/O. Patches provided - by Rob Davies. - - * Updated README. 
- - * Bugfix: return code is zero upon errors. - -Changes in BWA-MEM: - - * Changed the way a chimeric alignment is reported (conforming to the upcoming - SAM spec v1.5). With 0.7.5, if the read has a chimeric alignment, the paired - or the top hit uses soft clipping and is marked with neither 0x800 nor 0x100 - bits. All the other hits part of the chimeric alignment will use hard - clipping and be marked with 0x800 if option "-M" is not in use, or marked - with 0x100 otherwise. - - * Other hits part of a chimeric alignment are now reported in the SA tag, - conforming to the SAM spec v1.5. - - * Better method for resolving an alignment bridging two or more short - reference sequences. The current strategy maps the query to the reference - sequence that covers the middle point of the alignment. For most - applications, this change has no effects. - -Changes in BWA-backtrack: - - * Added a magic number to .sai files. This prevents samse/sampe from reading - corrupted .sai (e.g. a .sai file containing LSF log) or incompatible .sai - generated by a different version of bwa. - - * Bugfix: alignments in the XA:Z: tag were wrong. - - * Keep track of #ins and #del during backtracking. This simplifies the code - and reduces errors in rare corner cases. I should have done this in the - early days of bwa. - -In addition, if you use BWA-MEM or the fastmap command of BWA, please cite: - - - Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs - with BWA-MEM. arXiv:1303.3997v2 [q-bio.GN]. - -Thank you. - -(0.7.5: 29 May 2013, r404) - - - -Release 0.7.4 (23 April, 2013) ------------------------------- - -This is a bugfix release. Most of bugs are considered to be minor which only -occur very rarely. - - * Bugfix: wrong CIGAR when a query sequence bridges three or more target - sequences. This only happens when aligning reads to short assembly contigs. - - * Bugfix: leading "D" operator in CIGAR. 
- - * Extend more seeds for better alignment around tandem repeats. This is also - a cause of the leading "D" operator in CIGAR. - - * Bugfix: SSE2-SSW may occasionally find incorrect query starting position - around tandem repeat. This will lead to a suboptimal CIGAR in BWA-MEM and - a wrong CIGAR in BWA. - - * Bugfix: clipping penalty does not work as is intended when there is a gap - towards the end of a read. - - * Fixed an issue caused by a bug in the libc from Mac/Darwin. In Darwin, - fread() is unable to read a data block longer than 2GB due to an integer - overflow bug in its implementation. - -Since version 0.7.4, BWA-MEM is considered to reach similar stability to -BWA-backtrack for short-read mapping. - -(0.7.4: 23 April, r385) - - - -Release 0.7.3a (15 March, 2013) -------------------------------- - -In 0.7.3, the wrong CIGAR bug was only fixed in one scenario, but not fixed -in another corner case. - -(0.7.3a: 15 March 2013, r367) - - - -Release 0.7.3 (15 March, 2013) ------------------------------- - -Changes to BWA-MEM: - - * Bugfix: pairing score is inaccurate when option -A does not take the default - value. This is a very minor issue even if it happens. - - * Bugfix: occasionally wrong CIGAR. This happens when in the alignment there - is a 1bp deletion and a 1bp insertion which are close to the end of the - reads, and there are no other substitutions or indels. BWA-MEM would not do - a gapped alignment due to the bug. - - * New feature: output other non-overlapping alignments in the XP tag such that - we can see the entire picture of alignment from one SAM line. XP gives the - position, CIGAR, NM and mapQ of each aligned subsequence of the query. - -BWA-MEM has been used to align -300Gbp 100-700bp SE/PE reads. SNP/indel calling -has also been evaluated on part of these data. BWA-MEM generally gives better -pre-filtered SNP calls than BWA. No significant issues have been observed since -0.7.2, though minor improvements or bugs (e.g. 
the bug fixed in this release) -are still possible. If you find potential issues, please send bug reports to - (free registration required). - -In addition, more detailed description of the BWA-MEM algorithm can be found at -. - -(0.7.3: 15 March 2013, r366) - - - -Release 0.7.2 (9 March, 2013) ------------------------------ - -Emergent bug fix: 0.7.0 and 0.7.1 give a wrong sign to TLEN. In addition, -flagging 'properly paired' also gets improved a little. - -(0.7.2: 9 March 2013, r351) - - - -Release 0.7.1 (8 March, 2013) ------------------------------ - -Changes to BWA-MEM: - - * Bugfix: rare segmentation fault caused by a partial hit to the end of the - last sequence. - - * Bugfix: occasional mis-pairing given an interleaved fastq. - - * Bugfix: wrong mate information when the mate is unmapped. SAM generated by - BWA-MEM can now be validated with Picard. - - * Improved the performance and accuracy for ultra-long query sequences. - Short-read alignment is not affected. - -Changes to other components: - - * In BWA-backtrack and BWA-SW, replaced the code for global alignment, - Smith-Waterman and SW extension. The performance and accuracy of the two - algorithms stay the same. - - * Added an experimental subcommand to merge overlapping paired ends. The - algorithm is very conservative: it may miss true overlaps but rarely makes - mistakes. - -An important note is that like BWA-SW, BWA-MEM may output multiple primary -alignments for a read, which may cause problems to some tools. For aligning -sequence reads, it is advised to use '-M' to flag extra hits as secondary. This -option is not the default because multiple primary alignments are theoretically -possible in sequence alignment. - -(0.7.1: 8 March 2013, r347) - - - -Beta Release 0.7.0 (28 Feburary, 2013) --------------------------------------- - -This release comes with a new alignment algorithm, BWA-MEM, for 70bp-1Mbp query -sequences. 
BWA-MEM essentially seeds alignments with a variant of the fastmap -algorithm and extends seeds with banded affine-gap-penalty dynamic programming -(i.e. the Smith-Waterman-Gotoh algorithm). For typical Illumina 100bp reads or -longer low-divergence query sequences, BWA-MEM is about twice as fast as BWA -and BWA-SW and is more accurate. It also supports split alignments like BWA-SW -and may optionally output multiple hits like BWA. BWA-MEM does not guarantee -to find hits within a certain edit distance, but BWA is not efficient for such -task given longer reads anyway, and the edit-distance criterion is arguably -not as important in long-read alignment. - -In addition to the algorithmic improvements, BWA-MEM also implements a few -handy features in practical aspects: - - 1. BWA-MEM automatically switches between local and glocal (global wrt reads; - local wrt reference) alignment. It reports the end-to-end glocal alignment - if the glocal alignment is not much worse than the optimal local alignment. - Glocal alignment reduces reference bias. - - 2. BWA-MEM automatically infers pair orientation from a batch of single-end - alignments. It allows more than one orientations if there are sufficient - supporting reads. This feature has not been tested on reads from Illumina - jumping library yet. (EXPERIMENTAL) - - 3. BWA-MEM optionally takes one interleaved fastq for paired-end mapping. It - is possible to convert a name-sorted BAM to an interleaved fastq on the fly - and feed the data stream to BWA-MEM for mapping. - - 4. BWA-MEM optionally copies FASTA/Q comments to the final SAM output, which - helps to transfer individual read annotations to the output. - - 5. BWA-MEM supports more advanced piping. Users can now run: - (bwa mem ref.fa '20) CPU cores. - - * Check I/O error. - - * Increased the maximum barcode length to 63bp. - - * Automatically choose the indexing algorithm. - - * Bugfix: very rare segfault due to an uninitialized variable. 
The bug also - affects the placement of suboptimal alignments. The effect is very minor. - -This release involves quite a lot of tricky changes. Although it has been -tested on a few data sets, subtle bugs may be still hidden. It is *NOT* -recommended to use this release in a production pipeline. In future, however, -BWA-SW may be better when reads continue to go longer. I would encourage users -to try the 0.6 release. I would also like to hear the users' experience. Thank -you. - -(0.6.0: 12 November 2011, r85) - - - -Beta Release 0.5.9 (24 January, 2011) -------------------------------------- - -Notable changes: - - * Feature: barcode support via the '-B' option. - - * Feature: Illumina 1.3+ read format support via the '-I' option. - - * Bugfix: RG tags are not attached to unmapped reads. - - * Bugfix: very rare bwasw mismappings - - * Recommend options for PacBio reads in bwasw help message. - - -Also, since January 13, the BWA master repository has been moved to github: - - https://github.com/lh3/bwa - -The revision number has been reset. All recent changes will be first -committed to this repository. - -(0.5.9: 24 January 2011, r16) - - - -Beta Release Candidate 0.5.9rc1 (10 December, 2010) ---------------------------------------------------- - -Notable changes in bwasw: - - * Output unmapped reads. - - * For a repetitive read, choose a random hit instead of a fixed - one. This is not well tested. - -Notable changes in bwa-short: - - * Fixed a bug in the SW scoring system, which may lead to unexpected - gaps towards the end of a read. - - * Fixed a bug which invalidates the randomness of repetitive reads. - - * Fixed a rare memory leak. - - * Allowed to specify the read group at the command line. - - * Take name-grouped BAM files as input. - -Changes to this release are usually safe in that they do not interfere -with the key functionality. However, the release has only been tested on -small samples instead of on large-scale real data. 
If anything weird -happens, please report the bugs to the bio-bwa-help mailing list. - -(0.5.9rc1: 10 December 2010, r1561) - - - -Beta Release 0.5.8 (8 June, 2010) ---------------------------------- - -Notable changes in bwasw: - - * Fixed an issue of missing alignments. This should happen rarely and - only when the contig/read alignment is multi-part. Very rarely, bwasw - may still miss a segment in a multi-part alignment. This is difficult - to fix, although possible. - -Notable changes in bwa-short: - - * Discard the SW alignment when the best single-end alignment is much - better. Such a SW alignment may caused by structural variations and - forcing it to be aligned leads to false alignment. This fix has not - been tested thoroughly. It would be great to receive more users - feedbacks on this issue. - - * Fixed a typo/bug in sampe which leads to unnecessarily large memory - usage in some cases. - - * Further reduced the chance of reporting 'weird pairing'. - -(0.5.8: 8 June 2010, r1442) - - - -Beta Release 0.5.7 (1 March, 2010) ----------------------------------- - -This release only has an effect on paired-end data with fat insert-size -distribution. Users are still recommended to update as the new release -improves the robustness to poor data. - - * The fix for 'weird pairing' was not working in version 0.5.6, pointed - out by Carol Scott. It should work now. - - * Optionally output to a normal file rather than to stdout (by Tim - Fennel). - -(0.5.7: 1 March 2010, r1310) - - - -Beta Release 0.5.6 (10 Feburary, 2010) --------------------------------------- - -Notable changes in bwa-short: - - * Report multiple hits in the SAM format at a new tag XA encoded as: - (chr,pos,CIGAR,NM;)*. By default, if a paired or single-end read has - 4 or fewer hits, they will all be reported; if a read in a anomalous - pair has 11 or fewer hits, all of them will be reported. 
- - * Perform Smith-Waterman alignment also for anomalous read pairs when - both ends have quality higher than 17. This reduces false positives - for some SV discovery algorithms. - - * Do not report "weird pairing" when the insert size distribution is - too fat or has a mean close to zero. - - * If a read is bridging two adjacent chromsomes, flag it as unmapped. - - * Fixed a small but long existing memory leak in paired-end mapping. - - * Multiple bug fixes in SOLiD mapping: a) quality "-1" can be correctly - parsed by solid2fastq.pl; b) truncated quality string is resolved; c) - SOLiD read mapped to the reverse strand is complemented. - - * Bwa now calculates skewness and kurtosis of the insert size - distribution. - - * Deploy a Bayesian method to estimate the maximum distance for a read - pair considered to be paired properly. The method is proposed by - Gerton Lunter, but bwa only implements a simplified version. - - * Export more functions for Java bindings, by Matt Hanna (See: - http://www.broadinstitute.org/gsa/wiki/index.php/Sting_BWA/C_bindings) - - * Abstract bwa CIGAR for further extension, by Rodrigo Goya. - -(0.5.6: 10 Feburary 2010, r1303) - - - -Beta Release 0.5.5 (10 November, 2009) --------------------------------------- - -This is a bug fix release: - - * Fixed a serious bug/typo in aln which does not occur given short - reads, but will lead to segfault for >500bp reads. Of course, the aln - command is not recommended for reads longer than 200bp, but this is a - bug anyway. - - * Fixed a minor bug/typo which leads to incorrect single-end mapping - quality when one end is moved to meet the mate-pair requirement. - - * Fixed a bug in samse for mapping in the color space. This bug is - caused by quality filtration added since 0.5.1. - -(0.5.5: 10 November 2009, r1273) - - - -Beta Release 0.5.4 (9 October, 2009) ------------------------------------- - -Since this version, the default seed length used in the "aln" command is -changed to 32. 
- -Notable changes in bwa-short: - - * Added a new tag "XC:i" which gives the length of clipped reads. - - * In sampe, skip alignments in case of a bug in the Smith-Waterman - alignment module. - - * In sampe, fixed a bug in pairing when the read sequence is identical - to its reverse complement. - - * In sampe, optionally preload the entire FM-index into memory to - reduce disk operations. - -Notable changes in dBWT-SW/BWA-SW: - - * Changed name dBWT-SW to BWA-SW. - - * Optionally use "hard clipping" in the SAM output. - -(0.5.4: 9 October 2009, r1245) - - - -Beta Release 0.5.3 (15 September, 2009) ---------------------------------------- - -Fixed a critical bug in bwa-short: reads mapped to the reverse strand -are not complemented. - -(0.5.3: 15 September 2009, r1225) - - - -Beta Release 0.5.2 (13 September, 2009) ---------------------------------------- - -Notable changes in bwa-short: - - * Optionally trim reads before alignment. See the manual page on 'aln - -q' for detailed description. - - * Fixed a bug in calculating the NM tag for a gapped alignment. - - * Fixed a bug given a mixture of reads with some longer than the seed - length and some shorter. - - * Print SAM header. - -Notable changes in dBWT-SW: - - * Changed the default value of -T to 30. As a result, the accuracy is a - little higher for short reads at the cost of speed. - -(0.5.2: 13 September 2009, r1223) - - - -Beta Release 0.5.1 (2 September, 2009) --------------------------------------- - -Notable changes in the short read alignment component: - - * Fixed a bug in samse: do not write mate coordinates. - -Notable changes in dBWT-SW: - - * Randomly choose one alignment if the read is a repetitive. - - * Fixed a flaw when a read is mapped across two adjacent reference - sequences. However, wrong alignment reports may still occur rarely in - this case. - - * Changed the default band width to 50. The speed is slower due to this - change. 
- - * Improved the mapping quality a little given long query sequences. - -(0.5.1: 2 September 2009, r1209) - - - -Beta Release 0.5.0 (20 August, 2009) ------------------------------------- - -This release implements a novel algorithm, dBWT-SW, specifically -designed for long reads. It is 10-50 times faster than SSAHA2, depending -on the characteristics of the input data, and achieves comparable -alignment accuracy while allowing chimera detection. In comparison to -BLAT, dBWT-SW is several times faster and much more accurate especially -when the error rate is high. Please read the manual page for more -information. - -The dBWT-SW algorithm is kind of developed for future sequencing -technologies which produce much longer reads with a little higher error -rate. It is still at its early development stage. Some features are -missing and it may be buggy although I have evaluated on several -simulated and real data sets. But following the "release early" -paradigm, I would like the users to try it first. - -Other notable changes in BWA are: - - * Fixed a rare bug in the Smith-Waterman alignment module. - - * Fixed a rare bug about the wrong alignment coordinate when a read is - poorly aligned. - - * Fixed a bug in generating the "mate-unmap" SAM tag when both ends in - a pair are unmapped. - -(0.5.0: 20 August 2009, r1200) - - - -Beta Release 0.4.9 (19 May, 2009) ---------------------------------- - -Interestingly, the integer overflow bug claimed to be fixed in 0.4.7 has -not in fact. Now I have fixed the bug. Sorry for this and thank Quan -Long for pointing out the bug (again). - -(0.4.9: 19 May 2009, r1075) - - - -Beta Release 0.4.8 (18 May, 2009) ---------------------------------- - -One change to "aln -R". Now by default, if there are no more than '-R' -equally best hits, bwa will search for suboptimal hits. This change -affects the ability in finding SNPs in segmental duplications. 
- -I have not tested this option thoroughly, but this simple change is less -likely to cause new bugs. Hope I am right. - -(0.4.8: 18 May 2009, r1073) - - - -Beta Release 0.4.7 (12 May, 2009) ---------------------------------- - -Notable changes: - - * Output SM (single-end mapping quality) and AM (smaller mapping - quality among the two ends) tag from sam output. - - * Improved the functionality of stdsw. - - * Made the XN tag more accurate. - - * Fixed a very rare segfault caused by integer overflow. - - * Improve the insert size estimation. - - * Fixed compiling errors for some Linux systems. - -(0.4.7: 12 May 2009, r1066) - - - -Beta Release 0.4.6 (9 March, 2009) ----------------------------------- - -This release improves the SOLiD support. First, a script for converting -SOLiD raw data is provided. This script is adapted from solid2fastq.pl -in the MAQ package. Second, a nucleotide reference file can be directly -used with 'bwa index'. Third, SOLiD paired-end support is -completed. Fourth, color-space reads will be converted to nucleotides -when SAM output is generated. Color errors are corrected in this -process. Please note that like MAQ, BWA cannot make use of the primer -base and the first color. - -In addition, the calculation of mapping quality is also improved a -little bit, although end-users may barely observe the difference. - -(0.4.6: 9 March 2009, r915) - - - -Beta Release 0.4.5 (18 Feburary, 2009) --------------------------------------- - -Not much happened, but I think it would be good to let the users use the -latest version. - -Notable changes (Thank Bob Handsaker for catching the two bugs): - - * Improved bounary check. Previous version may still give incorrect - alignment coordinates in rare cases. - - * Fixed a bug in SW alignment when no residue matches. This only - affects the 'sampe' command. - - * Robustly estimate insert size without setting the maximum on the - command line. 
Since this release 'sampe -a' only has an effect if - there are not enough good pairs to infer the insert size - distribution. - - * Reduced false PE alignments a little bit by using the inferred insert - size distribution. This fix may be more important for long insert - size libraries. - -(0.4.5: 18 Feburary 2009, r829) - - - -Beta Release 0.4.4 (15 Feburary, 2009) --------------------------------------- - -This is mainly a bug fix release. Notable changes are: - - * Imposed boundary check for extracting subsequence from the - genome. Previously this causes memory problem in rare cases. - - * Fixed a bug in failing to find whether an alignment overlapping with - N on the genome. - - * Changed MD tag to meet the latest SAM specification. - -(0.4.4: 15 Feburary 2009, r815) - - - -Beta Release 0.4.3 (22 January, 2009) ------------------------------------- - -Notable changes: - - * Treat an ambiguous base N as a mismatch. Previous versions will not - map reads containing any N. - - * Automatically choose the maximum allowed number of differences. This - is important when reads of different lengths are mixed together. - - * Print mate coordinate if only one end is unmapped. - - * Generate MD tag. This tag encodes the mismatching positions and the - reference bases at these positions. Deletions from the reference will - also be printed. - - * Optionally dump multiple hits from samse, in another concise format - rather than SAM. - - * Optionally disable iterative search. This is VERY SLOOOOW, though. - - * Fixed a bug in generate SAM. - -(0.4.3: 22 January 2009, r787) - - - -Beta Release 0.4.2 (9 January, 2009) ------------------------------------- - -Aaron Quinlan found a bug in the indexer: the bwa indexer segfaults if -there are no comment texts in the FASTA header. This is a critical -bug. Nothing else was changed. 
- -(0.4.2: 9 January 2009, r769) - - - -Beta Release 0.4.1 (7 January, 2009) ------------------------------------- - -I am sorry for the quick updates these days. I like to set a milestone -for BWA and this release seems to be. For paired end reads, BWA also -does Smith-Waterman alignment for an unmapped read whose mate can be -mapped confidently. With this strategy BWA achieves similar accuracy to -maq. Benchmark is also updated accordingly. - -(0.4.1: 7 January 2009, r760) - - - -Beta Release 0.4.0 (6 January, 2009) ------------------------------------- - -In comparison to the release two days ago, this release is mainly tuned -for performance with some tricks I learnt from Bowtie. However, as the -indexing format has also been changed, I have to increase the version -number to 0.4.0 to emphasize that *DATABASE MUST BE RE-INDEXED* with -'bwa index'. - - * Improved the speed by about 20%. - - * Added multi-threading to 'bwa aln'. - -(0.4.0: 6 January 2009, r756) - - - -Beta Release 0.3.0 (4 January, 2009) ------------------------------------- - - * Added paired-end support by separating SA calculation and alignment - output. - - * Added SAM output. - - * Added evaluation to the documentation. - -(0.3.0: 4 January 2009, r741) - - - -Beta Release 0.2.0 (15 Augusst, 2008) -------------------------------------- - - * Take the subsequence at the 5'-end as seed. Seeding strategy greatly - improves the speed for long reads, at the cost of missing a few true - hits that contain many differences in the seed. Seeding also increase - the memory by 800MB. - - * Fixed a bug which may miss some gapped alignments. Fixing the bug - also slows the speed a little. - -(0.2.0: 15 August 2008, r428) - - - -Beta Release 0.1.6 (08 Augusst, 2008) -------------------------------------- - - * Give accurate CIGAR string. 
- - * Add a simple interface to SW/NW alignment - -(0.1.6: 08 August 2008, r414) - - - -Beta Release 0.1.5 (27 July, 2008) ----------------------------------- - - * Improve the speed. This version is expected to give the same results. - -(0.1.5: 27 July 2008, r400) - - - -Beta Release 0.1.4 (22 July, 2008) ----------------------------------- - - * Fixed a bug which may cause missing gapped alignments. - - * More clearly define what alignments can be found by BWA (See - manual). Now BWA runs a little slower because it will visit more - potential gapped alignments. - - * A bit code clean up. - -(0.1.4: 22 July 2008, r387) - - - -Beta Release 0.1.3 (21 July, 2008) ----------------------------------- - -Improve the speed with some tricks on retrieving occurences. The results -should be exactly the same as that of 0.1.2. - -(0.1.3: 21 July 2008, r382) - - - -Beta Release 0.1.2 (17 July, 2008) ----------------------------------- - -Support gapped alignment. Codes for ungapped alignment has been removed. - -(0.1.2: 17 July 2008, r371) - - - -Beta Release 0.1.1 (03 June, 2008) ------------------------------------ - -This is the first release of BWA, Burrows-Wheeler Alignment tool. Please -read man page for more information about this software. - -(0.1.1: 03 June 2008, r349) diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/QSufSort.c --- a/bwa-0.7.9a/QSufSort.c Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,402 +0,0 @@ -/* QSufSort.c - - Original source from qsufsort.c - - Copyright 1999, N. Jesper Larsson, all rights reserved. - - This file contains an implementation of the algorithm presented in "Faster - Suffix Sorting" by N. Jesper Larsson (jesper@cs.lth.se) and Kunihiko - Sadakane (sada@is.s.u-tokyo.ac.jp). - - This software may be used freely for any purpose. 
However, when distributed, - the original source must be clearly stated, and, when the source code is - distributed, the copyright notice must be retained and any alterations in - the code must be clearly marked. No warranty is given regarding the quality - of this software. - - Modified by Wong Chi-Kwong, 2004 - - Changes summary: - Used long variable and function names - - Removed global variables - - Replace pointer references with array references - - Used insertion sort in place of selection sort and increased insertion sort threshold - - Reconstructing suffix array from inverse becomes an option - - Add handling where end-of-text symbol is not necessary < all characters - - Removed codes for supporting alphabet size > number of characters - - No warrenty is given regarding the quality of the modifications. - -*/ - - -#include -#include -#include -#include "QSufSort.h" - -#define min(value1, value2) ( ((value1) < (value2)) ? (value1) : (value2) ) -#define med3(a, b, c) ( ac ? b : a>c ? c : a)) -#define swap(a, b, t); t = a; a = b; b = t; - -// Static functions -static void QSufSortSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, - const qsint_t highestPos, const qsint_t numSortedChar); -static qsint_t QSufSortChoosePivot(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, - const qsint_t highestPos, const qsint_t numSortedChar); -static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, - const qsint_t highestPos, const qsint_t numSortedChar); -static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t alphabetSize); -static qsint_t QSufSortTransform(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol, - const qsint_t smallestInputSymbol, const qsint_t maxNewAlphabetSize, qsint_t *numSymbolAggregated); - -/* Makes suffix array p of x. x becomes inverse of p. 
p and x are both of size - n+1. Contents of x[0...n-1] are integers in the range l...k-1. Original - contents of x[n] is disregarded, the n-th symbol being regarded as - end-of-string smaller than all other symbols.*/ -void QSufSortSuffixSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol, - const qsint_t smallestInputSymbol, const int skipTransform) -{ - qsint_t i, j; - qsint_t s, negatedSortedGroupLength; - qsint_t numSymbolAggregated; - qsint_t numSortedPos = 1; - qsint_t newAlphabetSize; - - if (!skipTransform) { - /* bucketing possible*/ - newAlphabetSize = QSufSortTransform(V, I, numChar, largestInputSymbol, smallestInputSymbol, - numChar, &numSymbolAggregated); - QSufSortBucketSort(V, I, numChar, newAlphabetSize); - I[0] = -1; - V[numChar] = 0; - numSortedPos = numSymbolAggregated; - } - - while ((qsint_t)(I[0]) >= -(qsint_t)numChar) { - i = 0; - negatedSortedGroupLength = 0; - do { - s = I[i]; - if (s < 0) { - i -= s; /* skip over sorted group.*/ - negatedSortedGroupLength += s; - } else { - if (negatedSortedGroupLength) { - I[i+negatedSortedGroupLength] = negatedSortedGroupLength; /* combine preceding sorted groups */ - negatedSortedGroupLength = 0; - } - j = V[s] + 1; - QSufSortSortSplit(V, I, i, j - 1, numSortedPos); - i = j; - } - } while (i <= numChar); - if (negatedSortedGroupLength) { - /* array ends with a sorted group.*/ - I[i+negatedSortedGroupLength] = negatedSortedGroupLength; /* combine sorted groups at end of I.*/ - } - numSortedPos *= 2; /* double sorted-depth.*/ - } -} - -void QSufSortGenerateSaFromInverse(const qsint_t* V, qsint_t* __restrict I, const qsint_t numChar) -{ - qsint_t i; - for (i=0; i<=numChar; i++) - I[V[i]] = i + 1; -} - -/* Sorting routine called for each unsorted group. Sorts the array of integers - (suffix numbers) of length n starting at p. 
The algorithm is a ternary-split - quicksort taken from Bentley & McIlroy, "Engineering a Sort Function", - Software -- Practice and Experience 23(11), 1249-1265 (November 1993). This - function is based on Program 7.*/ -static void QSufSortSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, - const qsint_t highestPos, const qsint_t numSortedChar) { - - qsint_t a, b, c, d; - qsint_t l, m; - qsint_t f, v, s, t; - qsint_t tmp; - qsint_t numItem; - - numItem = highestPos - lowestPos + 1; - - if (numItem <= INSERT_SORT_NUM_ITEM) { - QSufSortInsertSortSplit(V, I, lowestPos, highestPos, numSortedChar); - return; - } - - v = QSufSortChoosePivot(V, I, lowestPos, highestPos, numSortedChar); - - a = b = lowestPos; - c = d = highestPos; - - while (1) { - while (c >= b && (f = KEY(V, I, b, numSortedChar)) <= v) { - if (f == v) { - swap(I[a], I[b], tmp); - a++; - } - b++; - } - while (c >= b && (f = KEY(V, I, c, numSortedChar)) >= v) { - if (f == v) { - swap(I[c], I[d], tmp); - d--; - } - c--; - } - if (b > c) - break; - swap(I[b], I[c], tmp); - b++; - c--; - } - - s = a - lowestPos; - t = b - a; - s = min(s, t); - for (l = lowestPos, m = b - s; m < b; l++, m++) { - swap(I[l], I[m], tmp); - } - - s = d - c; - t = highestPos - d; - s = min(s, t); - for (l = b, m = highestPos - s + 1; m <= highestPos; l++, m++) { - swap(I[l], I[m], tmp); - } - - s = b - a; - t = d - c; - if (s > 0) - QSufSortSortSplit(V, I, lowestPos, lowestPos + s - 1, numSortedChar); - - // Update group number for equal portion - a = lowestPos + s; - b = highestPos - t; - if (a == b) { - // Sorted group - V[I[a]] = a; - I[a] = -1; - } else { - // Unsorted group - for (c=a; c<=b; c++) - V[I[c]] = b; - } - - if (t > 0) - QSufSortSortSplit(V, I, highestPos - t + 1, highestPos, numSortedChar); - -} - -/* Algorithm by Bentley & McIlroy.*/ -static qsint_t QSufSortChoosePivot(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, - const qsint_t highestPos, const qsint_t 
numSortedChar) { - - qsint_t m; - qsint_t keyl, keym, keyn; - qsint_t key1, key2, key3; - qsint_t s; - qsint_t numItem; - - numItem = highestPos - lowestPos + 1; - - m = lowestPos + numItem / 2; - - s = numItem / 8; - key1 = KEY(V, I, lowestPos, numSortedChar); - key2 = KEY(V, I, lowestPos+s, numSortedChar); - key3 = KEY(V, I, lowestPos+2*s, numSortedChar); - keyl = med3(key1, key2, key3); - key1 = KEY(V, I, m-s, numSortedChar); - key2 = KEY(V, I, m, numSortedChar); - key3 = KEY(V, I, m+s, numSortedChar); - keym = med3(key1, key2, key3); - key1 = KEY(V, I, highestPos-2*s, numSortedChar); - key2 = KEY(V, I, highestPos-s, numSortedChar); - key3 = KEY(V, I, highestPos, numSortedChar); - keyn = med3(key1, key2, key3); - - return med3(keyl, keym, keyn); - - -} - -/* Quadratic sorting method to use for small subarrays. */ -static void QSufSortInsertSortSplit(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t lowestPos, - const qsint_t highestPos, const qsint_t numSortedChar) -{ - qsint_t i, j; - qsint_t tmpKey, tmpPos; - qsint_t numItem; - qsint_t key[INSERT_SORT_NUM_ITEM], pos[INSERT_SORT_NUM_ITEM]; - qsint_t negativeSortedLength; - qsint_t groupNum; - - numItem = highestPos - lowestPos + 1; - - for (i=0; i0 && key[j-1] > tmpKey; j--) { - key[j] = key[j-1]; - pos[j] = pos[j-1]; - } - key[j] = tmpKey; - pos[j] = tmpPos; - } - - negativeSortedLength = -1; - - i = numItem - 1; - groupNum = highestPos; - while (i > 0) { - I[i+lowestPos] = pos[i]; - V[I[i+lowestPos]] = groupNum; - if (key[i-1] == key[i]) { - negativeSortedLength = 0; - } else { - if (negativeSortedLength < 0) - I[i+lowestPos] = negativeSortedLength; - groupNum = i + lowestPos - 1; - negativeSortedLength--; - } - i--; - } - - I[lowestPos] = pos[0]; - V[I[lowestPos]] = groupNum; - if (negativeSortedLength < 0) - I[lowestPos] = negativeSortedLength; -} - -/* Bucketsort for first iteration. - - Input: x[0...n-1] holds integers in the range 1...k-1, all of which appear - at least once. x[n] is 0. 
(This is the corresponding output of transform.) k - must be at most n+1. p is array of size n+1 whose contents are disregarded. - - Output: x is V and p is I after the initial sorting stage of the refined - suffix sorting algorithm.*/ - -static void QSufSortBucketSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t alphabetSize) -{ - qsint_t i, c; - qsint_t d; - qsint_t groupNum; - qsint_t currentIndex; - - // mark linked list empty - for (i=0; i0; i--) { - c = I[i-1]; - d = (qsint_t)(V[c]); - groupNum = currentIndex; - V[c] = groupNum; - if (d >= 0) { - I[currentIndex] = c; - while (d >= 0) { - c = d; - d = V[c]; - V[c] = groupNum; - currentIndex--; - I[currentIndex] = c; - } - } else { - // sorted group - I[currentIndex] = -1; - } - currentIndex--; - } -} - -/* Transforms the alphabet of x by attempting to aggregate several symbols into - one, while preserving the suffix order of x. The alphabet may also be - compacted, so that x on output comprises all integers of the new alphabet - with no skipped numbers. - - Input: x is an array of size n+1 whose first n elements are positive - integers in the range l...k-1. p is array of size n+1, used for temporary - storage. q controls aggregation and compaction by defining the maximum intue - for any symbol during transformation: q must be at least k-l; if q<=n, - compaction is guaranteed; if k-l>n, compaction is never done; if q is - INT_MAX, the maximum number of symbols are aggregated into one. - - Output: Returns an integer j in the range 1...q representing the size of the - new alphabet. If j<=n+1, the alphabet is compacted. The global variable r is - set to the number of old symbols grouped into one. 
Only x[n] is 0.*/ -static qsint_t QSufSortTransform(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol, - const qsint_t smallestInputSymbol, const qsint_t maxNewAlphabetSize, qsint_t *numSymbolAggregated) -{ - qsint_t c, i, j; - qsint_t a; // numSymbolAggregated - qsint_t mask; - qsint_t minSymbolInChunk = 0, maxSymbolInChunk = 0; - qsint_t newAlphabetSize; - qsint_t maxNumInputSymbol, maxNumBit, maxSymbol; - - maxNumInputSymbol = largestInputSymbol - smallestInputSymbol + 1; - - for (maxNumBit = 0, i = maxNumInputSymbol; i; i >>= 1) ++maxNumBit; - maxSymbol = QSINT_MAX >> maxNumBit; - - c = maxNumInputSymbol; - for (a = 0; a < numChar && maxSymbolInChunk <= maxSymbol && c <= maxNewAlphabetSize; a++) { - minSymbolInChunk = (minSymbolInChunk << maxNumBit) | (V[a] - smallestInputSymbol + 1); - maxSymbolInChunk = c; - c = (maxSymbolInChunk << maxNumBit) | maxNumInputSymbol; - } - - mask = (1 << (a-1) * maxNumBit) - 1; /* mask masks off top old symbol from chunk.*/ - V[numChar] = smallestInputSymbol - 1; /* emulate zero terminator.*/ - - /* bucketing possible, compact alphabet.*/ - for (i=0; i<=maxSymbolInChunk; i++) - I[i] = 0; /* zero transformation table.*/ - c = minSymbolInChunk; - for (i=a; i<=numChar; i++) { - I[c] = 1; /* mark used chunk symbol.*/ - c = ((c & mask) << maxNumBit) | (V[i] - smallestInputSymbol + 1); /* shift in next old symbol in chunk.*/ - } - for (i=1; i number of characters - - No warrenty is given regarding the quality of the modifications. 
- -*/ - -#ifndef __QSUFSORT_H__ -#define __QSUFSORT_H__ - -#include - -#define KEY(V, I, p, h) ( V[ I[p] + h ] ) -#define INSERT_SORT_NUM_ITEM 16 - -typedef int64_t qsint_t; -#define QSINT_MAX INT64_MAX - -void QSufSortSuffixSort(qsint_t* __restrict V, qsint_t* __restrict I, const qsint_t numChar, const qsint_t largestInputSymbol, - const qsint_t smallestInputSymbol, const int skipTransform); -void QSufSortGenerateSaFromInverse(const qsint_t *V, qsint_t* __restrict I, const qsint_t numChar); - - -#endif diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/README.md --- a/bwa-0.7.9a/README.md Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,182 +0,0 @@ -##Getting started - - git clone https://github.com/lh3/bwa.git - cd bwa; make - ./bwa index ref.fa - ./bwa mem ref.fa read-se.fq.gz | gzip -3 > aln-se.sam.gz - ./bwa mem ref.fa read1.fq read2.fq | gzip -3 > aln-pe.sam.gz - -##Introduction - -BWA is a software package for mapping DNA sequences against a large reference -genome, such as the human genome. It consists of three algorithms: -BWA-backtrack, BWA-SW and BWA-MEM. The first algorithm is designed for Illumina -sequence reads up to 100bp, while the rest two for longer sequences ranged from -70bp to a few megabases. BWA-MEM and BWA-SW share similar features such as the -support of long reads and chimeric alignment, but BWA-MEM, which is the latest, -is generally recommended as it is faster and more accurate. BWA-MEM also has -better performance than BWA-backtrack for 70-100bp Illumina reads. - -For all the algorithms, BWA first needs to construct the FM-index for the -reference genome (the **index** command). Alignment algorithms are invoked with -different sub-commands: **aln/samse/sampe** for BWA-backtrack, -**bwasw** for BWA-SW and **mem** for the BWA-MEM algorithm. - -##Availability - -BWA is released under [GPLv3][1]. The latest source code is [freely -available at github][2]. 
Released packages can [be downloaded][3] at -SourceForge. After you acquire the source code, simply use `make` to compile -and copy the single executable `bwa` to the destination you want. The only -dependency required to build BWA is [zlib][14]. - -##Seeking helps - -The detailed usage is described in the man page available together with the -source code. You can use `man ./bwa.1` to view the man page in a terminal. The -[HTML version][4] of the man page can be found at the [BWA website][5]. If you -have questions about BWA, you may [sign up the mailing list][6] and then send -the questions to [bio-bwa-help@sourceforge.net][7]. You may also ask questions -in forums such as [BioStar][8] and [SEQanswers][9]. - -##Citing BWA - -* Li H. and Durbin R. (2009) Fast and accurate short read alignment with - Burrows-Wheeler transform. *Bioinformatics*, **25**, 1754-1760. [PMID: - [19451168][10]]. (if you use the BWA-backtrack algorithm) - -* Li H. and Durbin R. (2010) Fast and accurate long-read alignment with - Burrows-Wheeler transform. *Bioinformatics*, **26**, 589-595. [PMID: - [20080505][11]]. (if you use the BWA-SW algorithm) - -* Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs - with BWA-MEM. [arXiv:1303.3997v2][12] [q-bio.GN]. (if you use the BWA-MEM - algorithm or the **fastmap** command, or want to cite the whole BWA package) - -Please note that the last reference is a preprint hosted at [arXiv.org][13]. I -do not have plan to submit it to a peer-reviewed journal in the near future. - -##Frequently asked questions (FAQs) - -1. [What types of data does BWA work with?](#type) -2. [Why does a read appear multiple times in the output SAM?](#multihit) -3. [Does BWA work on reference sequences longer than 4GB in total?](#4gb) -4. [Why can one read in a pair has high mapping quality but the other has zero?](#pe0) -5. [How can a BWA-backtrack alignment stands out of the end of a chromosome?](#endref) -6. 
[How to map sequences to GRCh38 with ALT contigs?](#h38) - -####1. What types of data does BWA work with? - -BWA works with a variety types of DNA sequence data, though the optimal -algorithm and setting may vary. The following list gives the recommended -settings: - -* Illumina/454/IonTorrent single-end reads longer than ~70bp or assembly - contigs up to a few megabases mapped to a close related reference genome: - - bwa mem ref.fa reads.fq > aln.sam - -* Illumina single-end reads no longer than ~70bp: - - bwa aln ref.fa reads.fq > reads.sai; bwa samse ref.fa reads.sai reads.fq > aln-se.sam - -* Illumina/454/IonTorrent paired-end reads longer than ~70bp: - - bwa mem ref.fa read1.fq read2.fq > aln-pe.sam - -* Illumina paired-end reads no longer than ~70bp: - - bwa aln ref.fa read1.fq > read1.sai; bwa aln ref.fa read2.fq > read2.sai - bwa samse ref.fa reads.sai reads.fq > aln-pe.sam - -* PacBio subreads to a reference genome: - - bwa mem -x pacbio ref.fa reads.fq > aln.sam - -* PacBio subreads to themselves (the output is not SAM): - - bwa mem -x pbread reads.fq reads.fq > overlap.pas - -BWA-MEM is recommended for query sequences longer than ~70bp for a variety of -error rates (or sequence divergence). Generally, BWA-MEM is more tolerant with -errors given longer query sequences as the chance of missing all seeds is small. -As is shown above, with non-default settings, BWA-MEM works with PacBio subreads -with a sequencing error rate as high as ~15%. - -####2. Why does a read appear multiple times in the output SAM? - -BWA-SW and BWA-MEM perform local alignments. If there is a translocation, a gene -fusion or a long deletion, a read bridging the break point may have two hits, -occupying two lines in the SAM output. With the default setting of BWA-MEM, one -and only one line is primary and is soft clipped; other lines are tagged with -0x800 SAM flag (supplementary alignment) and are hard clipped. - -####3. Does BWA work on reference sequences longer than 4GB in total? 
- -Yes. Since 0.6.x, all BWA algorithms work with a genome with total length over -4GB. However, individual chromosome should not be longer than 2GB. - -####4. Why can one read in a pair has high mapping quality but the other has zero? - -This is correct. Mapping quality is assigned for individual read, not for a read -pair. It is possible that one read can be mapped unambiguously, but its mate -falls in a tandem repeat and thus its accurate position cannot be determined. - -####5. How can a BWA-backtrack alignment stands out of the end of a chromosome? - -Internally BWA concatenates all reference sequences into one long sequence. A -read may be mapped to the junction of two adjacent reference sequences. In this -case, BWA-backtrack will flag the read as unmapped (0x4), but you will see -position, CIGAR and all the tags. A similar issue may occur to BWA-SW alignment -as well. BWA-MEM does not have this problem. - -####6. How to map sequences to GRCh38 with ALT contigs? - -BWA-backtrack and BWA-MEM partially support mapping to a reference containing -ALT contigs that represent alternative alleles highly divergent from the -reference genome. - - # download the K8 executable required by bwa-helper.js - wget http://sourceforge.net/projects/lh3/files/k8/k8-0.2.1.tar.bz2/download - tar -jxf k8-0.2.1.tar.bz2 - - # download the ALT-to-GRCh38 alignment in the SAM format - wget http://sourceforge.net/projects/bio-bwa/files/hs38.alt.sam.gz/download - - # download the GRCh38 sequences with ALT contigs - wget ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/seqs_for_alignment_pipelines/GCA_000001405.15_GRCh38_full_analysis_set.fna.gz - - # index and mapping - bwa index -p hs38a GCA_000001405.15_GRCh38_full_analysis_set.fna.gz - bwa mem -h50 hs38a reads.fq | ./k8-linux bwa-helper.js genalt hs38.alt.sam.gz > out.sam - -Here, option `-h50` asks bwa-mem to output multiple hits in the XA tag if the -read has 50 or fewer hits. 
For each SAM line containing the XA tag, -`bwa-helper.js genalt` decodes the alignments in the XA tag, groups hits lifted -to the same chromosomal region, adjusts mapping quality and outputs all the -hits overlapping the reported hit. A read may be mapped to both the primary -assembly and one or more ALT contigs all with high mapping quality. - -Note that this procedure assumes reads are single-end and may miss hits to -highly repetitive regions as these hits will not be reported with option -`-h50`. `bwa-helper.js` is a prototype implementation not recommended for -production uses. - - - -[1]: http://en.wikipedia.org/wiki/GNU_General_Public_License -[2]: https://github.com/lh3/bwa -[3]: http://sourceforge.net/projects/bio-bwa/files/ -[4]: http://bio-bwa.sourceforge.net/bwa.shtml -[5]: http://bio-bwa.sourceforge.net/ -[6]: https://lists.sourceforge.net/lists/listinfo/bio-bwa-help -[7]: mailto:bio-bwa-help@sourceforge.net -[8]: http://biostars.org -[9]: http://seqanswers.com/ -[10]: http://www.ncbi.nlm.nih.gov/pubmed/19451168 -[11]: http://www.ncbi.nlm.nih.gov/pubmed/20080505 -[12]: http://arxiv.org/abs/1303.3997 -[13]: http://arxiv.org/ -[14]: http://zlib.net/ -[15]: https://github.com/lh3/bwa/tree/mem -[16]: ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Homo_sapiens/GRCh38/seqs_for_alignment_pipelines/ diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/bamlite.c --- a/bwa-0.7.9a/bamlite.c Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,210 +0,0 @@ -#include -#include -#include -#include -#include -#include "bamlite.h" - -#ifdef USE_MALLOC_WRAPPERS -# include "malloc_wrap.h" -#endif - -/********************* - * from bam_endian.c * - *********************/ - -static inline int bam_is_big_endian() -{ - long one= 1; - return !(*((char *)(&one))); -} -static inline uint16_t bam_swap_endian_2(uint16_t v) -{ - return (uint16_t)(((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8)); -} -static inline void 
*bam_swap_endian_2p(void *x) -{ - *(uint16_t*)x = bam_swap_endian_2(*(uint16_t*)x); - return x; -} -static inline uint32_t bam_swap_endian_4(uint32_t v) -{ - v = ((v & 0x0000FFFFU) << 16) | (v >> 16); - return ((v & 0x00FF00FFU) << 8) | ((v & 0xFF00FF00U) >> 8); -} -static inline void *bam_swap_endian_4p(void *x) -{ - *(uint32_t*)x = bam_swap_endian_4(*(uint32_t*)x); - return x; -} -static inline uint64_t bam_swap_endian_8(uint64_t v) -{ - v = ((v & 0x00000000FFFFFFFFLLU) << 32) | (v >> 32); - v = ((v & 0x0000FFFF0000FFFFLLU) << 16) | ((v & 0xFFFF0000FFFF0000LLU) >> 16); - return ((v & 0x00FF00FF00FF00FFLLU) << 8) | ((v & 0xFF00FF00FF00FF00LLU) >> 8); -} -static inline void *bam_swap_endian_8p(void *x) -{ - *(uint64_t*)x = bam_swap_endian_8(*(uint64_t*)x); - return x; -} - -/************** - * from bam.c * - **************/ - -int bam_is_be; - -bam_header_t *bam_header_init() -{ - bam_is_be = bam_is_big_endian(); - return (bam_header_t*)calloc(1, sizeof(bam_header_t)); -} - -void bam_header_destroy(bam_header_t *header) -{ - int32_t i; - if (header == 0) return; - if (header->target_name) { - for (i = 0; i < header->n_targets; ++i) - if (header->target_name[i]) free(header->target_name[i]); - if (header->target_len) free(header->target_len); - free(header->target_name); - } - if (header->text) free(header->text); - free(header); -} - -bam_header_t *bam_header_read(bamFile fp) -{ - bam_header_t *header; - char buf[4]; - int magic_len; - int32_t i = 1, name_len; - // read "BAM1" - magic_len = bam_read(fp, buf, 4); - if (magic_len != 4 || strncmp(buf, "BAM\001", 4) != 0) { - fprintf(stderr, "[bam_header_read] invalid BAM binary header (this is not a BAM file).\n"); - return NULL; - } - header = bam_header_init(); - // read plain text and the number of reference sequences - if (bam_read(fp, &header->l_text, 4) != 4) goto fail; - if (bam_is_be) bam_swap_endian_4p(&header->l_text); - header->text = (char*)calloc(header->l_text + 1, 1); - if (bam_read(fp, header->text, 
header->l_text) != header->l_text) goto fail; - if (bam_read(fp, &header->n_targets, 4) != 4) goto fail; - if (bam_is_be) bam_swap_endian_4p(&header->n_targets); - // read reference sequence names and lengths - header->target_name = (char**)calloc(header->n_targets, sizeof(char*)); - header->target_len = (uint32_t*)calloc(header->n_targets, 4); - for (i = 0; i != header->n_targets; ++i) { - if (bam_read(fp, &name_len, 4) != 4) goto fail; - if (bam_is_be) bam_swap_endian_4p(&name_len); - header->target_name[i] = (char*)calloc(name_len, 1); - if (bam_read(fp, header->target_name[i], name_len) != name_len) { - goto fail; - } - if (bam_read(fp, &header->target_len[i], 4) != 4) goto fail; - if (bam_is_be) bam_swap_endian_4p(&header->target_len[i]); - } - return header; - fail: - bam_header_destroy(header); - return NULL; -} - -static void swap_endian_data(const bam1_core_t *c, int data_len, uint8_t *data) -{ - uint8_t *s; - uint32_t i, *cigar = (uint32_t*)(data + c->l_qname); - s = data + c->n_cigar*4 + c->l_qname + c->l_qseq + (c->l_qseq + 1)/2; - for (i = 0; i < c->n_cigar; ++i) bam_swap_endian_4p(&cigar[i]); - while (s < data + data_len) { - uint8_t type; - s += 2; // skip key - type = toupper(*s); ++s; // skip type - if (type == 'C' || type == 'A') ++s; - else if (type == 'S') { bam_swap_endian_2p(s); s += 2; } - else if (type == 'I' || type == 'F') { bam_swap_endian_4p(s); s += 4; } - else if (type == 'D') { bam_swap_endian_8p(s); s += 8; } - else if (type == 'Z' || type == 'H') { while (*s) ++s; ++s; } - } -} - -int bam_read1(bamFile fp, bam1_t *b) -{ - bam1_core_t *c = &b->core; - int32_t block_len, ret, i; - uint32_t x[8]; - - if ((ret = bam_read(fp, &block_len, 4)) != 4) { - if (ret == 0) return -1; // normal end-of-file - else return -2; // truncated - } - if (bam_read(fp, x, sizeof(bam1_core_t)) != sizeof(bam1_core_t)) return -3; - if (bam_is_be) { - bam_swap_endian_4p(&block_len); - for (i = 0; i < 8; ++i) bam_swap_endian_4p(x + i); - } - c->tid = x[0]; 
c->pos = x[1]; - c->bin = x[2]>>16; c->qual = x[2]>>8&0xff; c->l_qname = x[2]&0xff; - c->flag = x[3]>>16; c->n_cigar = x[3]&0xffff; - c->l_qseq = x[4]; - c->mtid = x[5]; c->mpos = x[6]; c->isize = x[7]; - b->data_len = block_len - sizeof(bam1_core_t); - if (b->m_data < b->data_len) { - b->m_data = b->data_len; - kroundup32(b->m_data); - b->data = (uint8_t*)realloc(b->data, b->m_data); - } - if (bam_read(fp, b->data, b->data_len) != b->data_len) return -4; - b->l_aux = b->data_len - c->n_cigar * 4 - c->l_qname - c->l_qseq - (c->l_qseq+1)/2; - if (bam_is_be) swap_endian_data(c, b->data_len, b->data); - return 4 + block_len; -} - - -#ifdef USE_VERBOSE_ZLIB_WRAPPERS -// Versions of gzopen, gzread and gzclose that print up error messages - -gzFile bamlite_gzopen(const char *fn, const char *mode) { - gzFile fp; - if (strcmp(fn, "-") == 0) { - fp = gzdopen(fileno((strstr(mode, "r"))? stdin : stdout), mode); - if (!fp) { - fprintf(stderr, "Couldn't open %s : %s", - (strstr(mode, "r"))? "stdin" : "stdout", - strerror(errno)); - } - return fp; - } - if ((fp = gzopen(fn, mode)) == 0) { - fprintf(stderr, "Couldn't open %s : %s\n", fn, - errno ? strerror(errno) : "Out of memory"); - } - return fp; -} - -int bamlite_gzread(gzFile file, void *ptr, unsigned int len) { - int ret = gzread(file, ptr, len); - - if (ret < 0) { - int errnum = 0; - const char *msg = gzerror(file, &errnum); - fprintf(stderr, "gzread error: %s\n", - Z_ERRNO == errnum ? strerror(errno) : msg); - } - return ret; -} - -int bamlite_gzclose(gzFile file) { - int ret = gzclose(file); - if (Z_OK != ret) { - fprintf(stderr, "gzclose error: %s\n", - Z_ERRNO == ret ? 
strerror(errno) : zError(ret)); - } - - return ret; -} -#endif /* USE_VERBOSE_ZLIB_WRAPPERS */ diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/bamlite.h --- a/bwa-0.7.9a/bamlite.h Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,114 +0,0 @@ -#ifndef BAMLITE_H_ -#define BAMLITE_H_ - -#include -#include - -#ifdef USE_MALLOC_WRAPPERS -# include "malloc_wrap.h" -#endif - -#define USE_VERBOSE_ZLIB_WRAPPERS - -typedef gzFile bamFile; -#ifdef USE_VERBOSE_ZLIB_WRAPPERS -/* These print error messages on failure */ -# define bam_open(fn, mode) bamlite_gzopen(fn, mode) -# define bam_dopen(fd, mode) gzdopen(fd, mode) -# define bam_close(fp) bamlite_gzclose(fp) -# define bam_read(fp, buf, size) bamlite_gzread(fp, buf, size) -#else -# define bam_open(fn, mode) gzopen(fn, mode) -# define bam_dopen(fd, mode) gzdopen(fd, mode) -# define bam_close(fp) gzclose(fp) -# define bam_read(fp, buf, size) gzread(fp, buf, size) -#endif /* USE_VERBOSE_ZLIB_WRAPPERS */ - -typedef struct { - int32_t n_targets; - char **target_name; - uint32_t *target_len; - size_t l_text, n_text; - char *text; -} bam_header_t; - -#define BAM_FPAIRED 1 -#define BAM_FPROPER_PAIR 2 -#define BAM_FUNMAP 4 -#define BAM_FMUNMAP 8 -#define BAM_FREVERSE 16 -#define BAM_FMREVERSE 32 -#define BAM_FREAD1 64 -#define BAM_FREAD2 128 -#define BAM_FSECONDARY 256 -#define BAM_FQCFAIL 512 -#define BAM_FDUP 1024 - -#define BAM_CIGAR_SHIFT 4 -#define BAM_CIGAR_MASK ((1 << BAM_CIGAR_SHIFT) - 1) - -#define BAM_CMATCH 0 -#define BAM_CINS 1 -#define BAM_CDEL 2 -#define BAM_CREF_SKIP 3 -#define BAM_CSOFT_CLIP 4 -#define BAM_CHARD_CLIP 5 -#define BAM_CPAD 6 - -typedef struct { - int32_t tid; - int32_t pos; - uint32_t bin:16, qual:8, l_qname:8; - uint32_t flag:16, n_cigar:16; - int32_t l_qseq; - int32_t mtid; - int32_t mpos; - int32_t isize; -} bam1_core_t; - -typedef struct { - bam1_core_t core; - int l_aux, data_len, m_data; - uint8_t *data; -} bam1_t; - -#ifndef kroundup32 -#define kroundup32(x) (--(x), 
(x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) -#endif - -#define bam1_strand(b) (((b)->core.flag&BAM_FREVERSE) != 0) -#define bam1_mstrand(b) (((b)->core.flag&BAM_FMREVERSE) != 0) -#define bam1_cigar(b) ((uint32_t*)((b)->data + (b)->core.l_qname)) -#define bam1_qname(b) ((char*)((b)->data)) -#define bam1_seq(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname) -#define bam1_qual(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (((b)->core.l_qseq + 1)>>1)) -#define bam1_seqi(s, i) ((s)[(i)/2] >> 4*(1-(i)%2) & 0xf) -#define bam1_aux(b) ((b)->data + (b)->core.n_cigar*4 + (b)->core.l_qname + (b)->core.l_qseq + ((b)->core.l_qseq + 1)/2) - -#define bam_init1() ((bam1_t*)calloc(1, sizeof(bam1_t))) -#define bam_destroy1(b) do { \ - if (b) { free((b)->data); free(b); } \ - } while (0) - -extern int bam_is_be; - -#ifdef __cplusplus -extern "C" { -#endif - - bam_header_t *bam_header_init(void); - void bam_header_destroy(bam_header_t *header); - bam_header_t *bam_header_read(bamFile fp); - int bam_read1(bamFile fp, bam1_t *b); - -#ifdef USE_VERBOSE_ZLIB_WRAPPERS - gzFile bamlite_gzopen(const char *fn, const char *mode); - int bamlite_gzread(gzFile file, void *ptr, unsigned int len); - int bamlite_gzclose(gzFile file); -#endif /* USE_VERBOSE_ZLIB_WRAPPERS */ - -#ifdef __cplusplus -} -#endif - -#endif diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/bntseq.c --- a/bwa-0.7.9a/bntseq.c Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,412 +0,0 @@ -/* The MIT License - - Copyright (c) 2008 Genome Research Ltd (GRL). 
- - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. 
-*/ - -/* Contact: Heng Li */ - -#include -#include -#include -#include -#include -#include -#include "bntseq.h" -#include "utils.h" - -#include "kseq.h" -KSEQ_DECLARE(gzFile) - -#ifdef USE_MALLOC_WRAPPERS -# include "malloc_wrap.h" -#endif - -unsigned char nst_nt4_table[256] = { - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5 /*'-'*/, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 -}; - -void bns_dump(const bntseq_t *bns, const char *prefix) -{ - char str[1024]; - FILE *fp; - int i; - { // dump .ann - strcpy(str, prefix); strcat(str, ".ann"); - fp = xopen(str, "w"); - err_fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->seed); - for (i = 0; i != bns->n_seqs; ++i) { - bntann1_t *p = bns->anns + i; - err_fprintf(fp, "%d %s", p->gi, p->name); - if (p->anno[0]) err_fprintf(fp, " %s\n", p->anno); - else err_fprintf(fp, "\n"); - err_fprintf(fp, "%lld %d %d\n", (long long)p->offset, p->len, p->n_ambs); - } - err_fflush(fp); - err_fclose(fp); - } - { // dump .amb - strcpy(str, prefix); strcat(str, ".amb"); - fp = xopen(str, "w"); - err_fprintf(fp, "%lld %d %u\n", (long long)bns->l_pac, bns->n_seqs, bns->n_holes); - for (i = 0; i != bns->n_holes; ++i) { - bntamb1_t *p = bns->ambs + i; - err_fprintf(fp, "%lld %d %c\n", (long long)p->offset, p->len, p->amb); - } - 
err_fflush(fp); - err_fclose(fp); - } -} - -bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename) -{ - char str[1024]; - FILE *fp; - const char *fname; - bntseq_t *bns; - long long xx; - int i; - int scanres; - bns = (bntseq_t*)calloc(1, sizeof(bntseq_t)); - { // read .ann - fp = xopen(fname = ann_filename, "r"); - scanres = fscanf(fp, "%lld%d%u", &xx, &bns->n_seqs, &bns->seed); - if (scanres != 3) goto badread; - bns->l_pac = xx; - bns->anns = (bntann1_t*)calloc(bns->n_seqs, sizeof(bntann1_t)); - for (i = 0; i < bns->n_seqs; ++i) { - bntann1_t *p = bns->anns + i; - char *q = str; - int c; - // read gi and sequence name - scanres = fscanf(fp, "%u%s", &p->gi, str); - if (scanres != 2) goto badread; - p->name = strdup(str); - // read fasta comments - while (str - q < sizeof(str) - 1 && (c = fgetc(fp)) != '\n' && c != EOF) *q++ = c; - while (c != '\n' && c != EOF) c = fgetc(fp); - if (c == EOF) { - scanres = EOF; - goto badread; - } - *q = 0; - if (q - str > 1) p->anno = strdup(str + 1); // skip leading space - else p->anno = strdup(""); - // read the rest - scanres = fscanf(fp, "%lld%d%d", &xx, &p->len, &p->n_ambs); - if (scanres != 3) goto badread; - p->offset = xx; - } - err_fclose(fp); - } - { // read .amb - int64_t l_pac; - int32_t n_seqs; - fp = xopen(fname = amb_filename, "r"); - scanres = fscanf(fp, "%lld%d%d", &xx, &n_seqs, &bns->n_holes); - if (scanres != 3) goto badread; - l_pac = xx; - xassert(l_pac == bns->l_pac && n_seqs == bns->n_seqs, "inconsistent .ann and .amb files."); - bns->ambs = bns->n_holes? 
(bntamb1_t*)calloc(bns->n_holes, sizeof(bntamb1_t)) : 0; - for (i = 0; i < bns->n_holes; ++i) { - bntamb1_t *p = bns->ambs + i; - scanres = fscanf(fp, "%lld%d%s", &xx, &p->len, str); - if (scanres != 3) goto badread; - p->offset = xx; - p->amb = str[0]; - } - err_fclose(fp); - } - { // open .pac - bns->fp_pac = xopen(pac_filename, "rb"); - } - return bns; - - badread: - if (EOF == scanres) { - err_fatal(__func__, "Error reading %s : %s\n", fname, ferror(fp) ? strerror(errno) : "Unexpected end of file"); - } - err_fatal(__func__, "Parse error reading %s\n", fname); -} - -bntseq_t *bns_restore(const char *prefix) -{ - char ann_filename[1024], amb_filename[1024], pac_filename[1024]; - strcat(strcpy(ann_filename, prefix), ".ann"); - strcat(strcpy(amb_filename, prefix), ".amb"); - strcat(strcpy(pac_filename, prefix), ".pac"); - return bns_restore_core(ann_filename, amb_filename, pac_filename); -} - -void bns_destroy(bntseq_t *bns) -{ - if (bns == 0) return; - else { - int i; - if (bns->fp_pac) err_fclose(bns->fp_pac); - free(bns->ambs); - for (i = 0; i < bns->n_seqs; ++i) { - free(bns->anns[i].name); - free(bns->anns[i].anno); - } - free(bns->anns); - free(bns); - } -} - -#define _set_pac(pac, l, c) ((pac)[(l)>>2] |= (c)<<((~(l)&3)<<1)) -#define _get_pac(pac, l) ((pac)[(l)>>2]>>((~(l)&3)<<1)&3) - -static uint8_t *add1(const kseq_t *seq, bntseq_t *bns, uint8_t *pac, int64_t *m_pac, int *m_seqs, int *m_holes, bntamb1_t **q) -{ - bntann1_t *p; - int i, lasts; - if (bns->n_seqs == *m_seqs) { - *m_seqs <<= 1; - bns->anns = (bntann1_t*)realloc(bns->anns, *m_seqs * sizeof(bntann1_t)); - } - p = bns->anns + bns->n_seqs; - p->name = strdup((char*)seq->name.s); - p->anno = seq->comment.s? strdup((char*)seq->comment.s) : strdup("(null)"); - p->gi = 0; p->len = seq->seq.l; - p->offset = (bns->n_seqs == 0)? 
0 : (p-1)->offset + (p-1)->len; - p->n_ambs = 0; - for (i = lasts = 0; i < seq->seq.l; ++i) { - int c = nst_nt4_table[(int)seq->seq.s[i]]; - if (c >= 4) { // N - if (lasts == seq->seq.s[i]) { // contiguous N - ++(*q)->len; - } else { - if (bns->n_holes == *m_holes) { - (*m_holes) <<= 1; - bns->ambs = (bntamb1_t*)realloc(bns->ambs, (*m_holes) * sizeof(bntamb1_t)); - } - *q = bns->ambs + bns->n_holes; - (*q)->len = 1; - (*q)->offset = p->offset + i; - (*q)->amb = seq->seq.s[i]; - ++p->n_ambs; - ++bns->n_holes; - } - } - lasts = seq->seq.s[i]; - { // fill buffer - if (c >= 4) c = lrand48()&3; - if (bns->l_pac == *m_pac) { // double the pac size - *m_pac <<= 1; - pac = realloc(pac, *m_pac/4); - memset(pac + bns->l_pac/4, 0, (*m_pac - bns->l_pac)/4); - } - _set_pac(pac, bns->l_pac, c); - ++bns->l_pac; - } - } - ++bns->n_seqs; - return pac; -} - -int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only) -{ - extern void seq_reverse(int len, ubyte_t *seq, int is_comp); // in bwaseqio.c - kseq_t *seq; - char name[1024]; - bntseq_t *bns; - uint8_t *pac = 0; - int32_t m_seqs, m_holes; - int64_t ret = -1, m_pac, l; - bntamb1_t *q; - FILE *fp; - - // initialization - seq = kseq_init(fp_fa); - bns = (bntseq_t*)calloc(1, sizeof(bntseq_t)); - bns->seed = 11; // fixed seed for random generator - srand48(bns->seed); - m_seqs = m_holes = 8; m_pac = 0x10000; - bns->anns = (bntann1_t*)calloc(m_seqs, sizeof(bntann1_t)); - bns->ambs = (bntamb1_t*)calloc(m_holes, sizeof(bntamb1_t)); - pac = calloc(m_pac/4, 1); - q = bns->ambs; - strcpy(name, prefix); strcat(name, ".pac"); - fp = xopen(name, "wb"); - // read sequences - while (kseq_read(seq) >= 0) pac = add1(seq, bns, pac, &m_pac, &m_seqs, &m_holes, &q); - if (!for_only) { // add the reverse complemented sequence - m_pac = (bns->l_pac * 2 + 3) / 4 * 4; - pac = realloc(pac, m_pac/4); - memset(pac + (bns->l_pac+3)/4, 0, (m_pac - (bns->l_pac+3)/4*4) / 4); - for (l = bns->l_pac - 1; l >= 0; --l, ++bns->l_pac) - _set_pac(pac, 
bns->l_pac, 3-_get_pac(pac, l)); - } - ret = bns->l_pac; - { // finalize .pac file - ubyte_t ct; - err_fwrite(pac, 1, (bns->l_pac>>2) + ((bns->l_pac&3) == 0? 0 : 1), fp); - // the following codes make the pac file size always (l_pac/4+1+1) - if (bns->l_pac % 4 == 0) { - ct = 0; - err_fwrite(&ct, 1, 1, fp); - } - ct = bns->l_pac % 4; - err_fwrite(&ct, 1, 1, fp); - // close .pac file - err_fflush(fp); - err_fclose(fp); - } - bns_dump(bns, prefix); - bns_destroy(bns); - kseq_destroy(seq); - free(pac); - return ret; -} - -int bwa_fa2pac(int argc, char *argv[]) -{ - int c, for_only = 0; - gzFile fp; - while ((c = getopt(argc, argv, "f")) >= 0) { - switch (c) { - case 'f': for_only = 1; break; - } - } - if (argc == optind) { - fprintf(stderr, "Usage: bwa fa2pac [-f] []\n"); - return 1; - } - fp = xzopen(argv[optind], "r"); - bns_fasta2bntseq(fp, (optind+1 < argc)? argv[optind+1] : argv[optind], for_only); - err_gzclose(fp); - return 0; -} - -int bns_pos2rid(const bntseq_t *bns, int64_t pos_f) -{ - int left, mid, right; - if (pos_f >= bns->l_pac) return -1; - left = 0; mid = 0; right = bns->n_seqs; - while (left < right) { // binary search - mid = (left + right) >> 1; - if (pos_f >= bns->anns[mid].offset) { - if (mid == bns->n_seqs - 1) break; - if (pos_f < bns->anns[mid+1].offset) break; // bracketed - left = mid + 1; - } else right = mid; - } - return mid; -} - -int bns_intv2rid(const bntseq_t *bns, int64_t rb, int64_t re) -{ - int is_rev, rid_b, rid_e; - if (rb < bns->l_pac && re > bns->l_pac) return -2; - rid_b = bns_pos2rid(bns, bns_depos(bns, rb, &is_rev)); - rid_e = bns_pos2rid(bns, bns_depos(bns, re, &is_rev) - 1); - return rid_b == rid_e? 
rid_b : -1; -} - -int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id) -{ - int left, mid, right, nn; - if (ref_id) *ref_id = bns_pos2rid(bns, pos_f); - left = 0; right = bns->n_holes; nn = 0; - while (left < right) { - mid = (left + right) >> 1; - if (pos_f >= bns->ambs[mid].offset + bns->ambs[mid].len) left = mid + 1; - else if (pos_f + len <= bns->ambs[mid].offset) right = mid; - else { // overlap - if (pos_f >= bns->ambs[mid].offset) { - nn += bns->ambs[mid].offset + bns->ambs[mid].len < pos_f + len? - bns->ambs[mid].offset + bns->ambs[mid].len - pos_f : len; - } else { - nn += bns->ambs[mid].offset + bns->ambs[mid].len < pos_f + len? - bns->ambs[mid].len : len - (bns->ambs[mid].offset - pos_f); - } - break; - } - } - return nn; -} - -uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len) -{ - uint8_t *seq = 0; - if (end < beg) end ^= beg, beg ^= end, end ^= beg; // if end is smaller, swap - if (end > l_pac<<1) end = l_pac<<1; - if (beg < 0) beg = 0; - if (beg >= l_pac || end <= l_pac) { - int64_t k, l = 0; - *len = end - beg; - seq = malloc(end - beg); - if (beg >= l_pac) { // reverse strand - int64_t beg_f = (l_pac<<1) - 1 - end; - int64_t end_f = (l_pac<<1) - 1 - beg; - for (k = end_f; k > beg_f; --k) - seq[l++] = 3 - _get_pac(pac, k); - } else { // forward strand - for (k = beg; k < end; ++k) - seq[l++] = _get_pac(pac, k); - } - } else *len = 0; // if bridging the forward-reverse boundary, return nothing - return seq; -} - -uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid) -{ - int64_t far_beg, far_end, len; - int is_rev; - uint8_t *seq; - - if (*end < *beg) *end ^= *beg, *beg ^= *end, *end ^= *beg; // if end is smaller, swap - assert(*beg <= mid && mid < *end); - *rid = bns_pos2rid(bns, bns_depos(bns, mid, &is_rev)); - far_beg = bns->anns[*rid].offset; - far_end = far_beg + bns->anns[*rid].len; - if (is_rev) { // flip to the 
reverse strand - int64_t tmp = far_beg; - far_beg = (bns->l_pac<<1) - far_end; - far_end = (bns->l_pac<<1) - tmp; - } - *beg = *beg > far_beg? *beg : far_beg; - *end = *end < far_end? *end : far_end; - seq = bns_get_seq(bns->l_pac, pac, *beg, *end, &len); - if (seq == 0 || *end - *beg != len) { - fprintf(stderr, "[E::%s] begin=%ld, mid=%ld, end=%ld, len=%ld, seq=%p, rid=%d, far_beg=%ld, far_end=%ld\n", - __func__, (long)*beg, (long)mid, (long)*end, (long)len, seq, *rid, (long)far_beg, (long)far_end); - } - assert(seq && *end - *beg == len); // assertion failure should never happen - return seq; -} diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/bntseq.h --- a/bwa-0.7.9a/bntseq.h Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,91 +0,0 @@ -/* The MIT License - - Copyright (c) 2008 Genome Research Ltd (GRL). - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. 
-*/ - -/* Contact: Heng Li */ - -#ifndef BWT_BNTSEQ_H -#define BWT_BNTSEQ_H - -#include -#include -#include -#include - -#ifndef BWA_UBYTE -#define BWA_UBYTE -typedef uint8_t ubyte_t; -#endif - -typedef struct { - int64_t offset; - int32_t len; - int32_t n_ambs; - uint32_t gi; - char *name, *anno; -} bntann1_t; - -typedef struct { - int64_t offset; - int32_t len; - char amb; -} bntamb1_t; - -typedef struct { - int64_t l_pac; - int32_t n_seqs; - uint32_t seed; - bntann1_t *anns; // n_seqs elements - int32_t n_holes; - bntamb1_t *ambs; // n_holes elements - FILE *fp_pac; -} bntseq_t; - -extern unsigned char nst_nt4_table[256]; - -#ifdef __cplusplus -extern "C" { -#endif - - void bns_dump(const bntseq_t *bns, const char *prefix); - bntseq_t *bns_restore(const char *prefix); - bntseq_t *bns_restore_core(const char *ann_filename, const char* amb_filename, const char* pac_filename); - void bns_destroy(bntseq_t *bns); - int64_t bns_fasta2bntseq(gzFile fp_fa, const char *prefix, int for_only); - int bns_pos2rid(const bntseq_t *bns, int64_t pos_f); - int bns_cnt_ambi(const bntseq_t *bns, int64_t pos_f, int len, int *ref_id); - uint8_t *bns_get_seq(int64_t l_pac, const uint8_t *pac, int64_t beg, int64_t end, int64_t *len); - uint8_t *bns_fetch_seq(const bntseq_t *bns, const uint8_t *pac, int64_t *beg, int64_t mid, int64_t *end, int *rid); - int bns_intv2rid(const bntseq_t *bns, int64_t rb, int64_t re); - -#ifdef __cplusplus -} -#endif - -static inline int64_t bns_depos(const bntseq_t *bns, int64_t pos, int *is_rev) -{ - return (*is_rev = (pos >= bns->l_pac))? (bns->l_pac<<1) - 1 - pos : pos; -} - -#endif diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/bwa-helper.js --- a/bwa-0.7.9a/bwa-helper.js Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,706 +0,0 @@ -/***************************************************************** - * The K8 Javascript interpreter is required to run this script. 
* - * * - * Source code: https://github.com/attractivechaos/k8 * - * Binary: http://sourceforge.net/projects/lh3/files/k8/ * - * * - * Data file used for generating GRCh38 ALT alignments: * - * * - * http://sourceforge.net/projects/bio-bwa/files/ * - *****************************************************************/ - -/****************** - *** From k8.js *** - ******************/ - -var getopt = function(args, ostr) { - var oli; // option letter list index - if (typeof(getopt.place) == 'undefined') - getopt.ind = 0, getopt.arg = null, getopt.place = -1; - if (getopt.place == -1) { // update scanning pointer - if (getopt.ind >= args.length || args[getopt.ind].charAt(getopt.place = 0) != '-') { - getopt.place = -1; - return null; - } - if (getopt.place + 1 < args[getopt.ind].length && args[getopt.ind].charAt(++getopt.place) == '-') { // found "--" - ++getopt.ind; - getopt.place = -1; - return null; - } - } - var optopt = args[getopt.ind].charAt(getopt.place++); // character checked for validity - if (optopt == ':' || (oli = ostr.indexOf(optopt)) < 0) { - if (optopt == '-') return null; // if the user didn't specify '-' as an option, assume it means null. 
- if (getopt.place < 0) ++getopt.ind; - return '?'; - } - if (oli+1 >= ostr.length || ostr.charAt(++oli) != ':') { // don't need argument - getopt.arg = null; - if (getopt.place < 0 || getopt.place >= args[getopt.ind].length) ++getopt.ind, getopt.place = -1; - } else { // need an argument - if (getopt.place >= 0 && getopt.place < args[getopt.ind].length) - getopt.arg = args[getopt.ind].substr(getopt.place); - else if (args.length <= ++getopt.ind) { // no arg - getopt.place = -1; - if (ostr.length > 0 && ostr.charAt(0) == ':') return ':'; - return '?'; - } else getopt.arg = args[getopt.ind]; // white space - getopt.place = -1; - ++getopt.ind; - } - return optopt; -} - -function obj2str(o) -{ - if (typeof(o) != 'object') { - return o.toString(); - } else if (o == null) { - return "null"; - } else if (Array.isArray(o)) { - var s = "["; - for (var i = 0; i < o.length; ++i) { - if (i) s += ','; - s += obj2str(o[i]); - } - return s + "]"; - } else { - var i = 0, s = "{"; - for (var key in o) { - if (i++) s += ','; - s += key + ":"; - s += obj2str(o[key]); - } - return s + "}"; - } -} - -Bytes.prototype.reverse = function() -{ - for (var i = 0; i < this.length>>1; ++i) { - var tmp = this[i]; - this[i] = this[this.length - i - 1]; - this[this.length - i - 1] = tmp; - } -} - -Bytes.prototype.revcomp = function() -{ - if (Bytes.rctab == null) { - var s1 = 'WSATUGCYRKMBDHVNwsatugcyrkmbdhvn'; - var s2 = 'WSTAACGRYMKVHDBNwstaacgrymkvhdbn'; - Bytes.rctab = []; - for (var i = 0; i < 256; ++i) Bytes.rctab[i] = 0; - for (var i = 0; i < s1.length; ++i) - Bytes.rctab[s1.charCodeAt(i)] = s2.charCodeAt(i); - } - for (var i = 0; i < this.length>>1; ++i) { - var tmp = this[this.length - i - 1]; - this[this.length - i - 1] = Bytes.rctab[this[i]]; - this[i] = Bytes.rctab[tmp]; - } - if (this.length>>1) - this[this.length>>1] = Bytes.rctab[this[this.length>>1]]; -} - -/************************ - *** command markovlp *** - ************************/ - -function bwa_markOvlp(args) -{ - var c, 
min_aln_ratio = .9, min_ext = 50; - while ((c = getopt(args, "r:e:")) != null) { - if (c == 'r') min_aln_ratio = parseFloat(getopt.arg); - else if (c == 'e') min_ext = parseInt(getopt.arg); - } - - var file = args.length == getopt.ind? new File() : new File(args[getopt.ind]); - var buf = new Bytes(); - var dir4 = ['>>', '><', '<>', '<<']; - - while (file.readline(buf) >= 0) { - var t = buf.toString().split("\t") - for (var i = 0; i < t.length; ++i) - if (i != 0 && i != 4) - t[i] = parseInt(t[i]); - var el, a1, a2, e1, e2, r1, r2; // a: aligned length; e: extended length; r: remaining length - e2 = a2 = t[7] - t[6]; - if (t[2] < t[3]) { // forward-forward match - e1 = a1 = t[3] - t[2]; - r1 = t[2] - t[6]; - r2 = (t[5] - t[7]) - (t[1] - t[3]); - el = r1 > 0? t[6] : t[2]; - el += r2 > 0? t[1] - t[3] : t[5] - t[7]; - } else { // reverse-forward match - e1 = a1 = t[2] - t[3]; - r1 = (t[1] - t[2]) - t[6]; - r2 = (t[5] - t[7]) - t[3]; - el = r1 > 0? t[6] : t[1] - t[2]; - el += r2 > 0? t[3] : t[5] - t[7]; - } - e1 += el; e2 += el; - var type; - if (a1 / e1 >= min_aln_ratio && a2 / e2 >= min_aln_ratio) { - if ((r1 >= min_ext && r2 >= min_ext) || (r1 <= -min_ext && r2 <= -min_ext)) { // suffix-prefix match - var d = t[2] < t[3]? 0 : 2; - if (r1 < 0) d ^= 3; // reverse the direction - type = 'O' + dir4[d]; - } else type = 'C' + (e1 / t[1] > e2 / t[5]? 1 : 2); - } else type = 'I'; // internal local match; not a suffix-prefix match - //print(t[1], e1, a1, t[5], e2, a2); - print(type, buf); - } - - buf.destroy(); - file.close(); -} - -/*********************** - *** command pas2bed *** - ***********************/ - -function bwa_pas2reg(args) -{ - var file = args.length? 
new File(args[0]) : new File(); - var buf = new Bytes(); - - while (file.readline(buf) >= 0) { - var t = buf.toString().split("\t"); - if (t[0] == t[4]) continue; - if (parseInt(t[2]) < parseInt(t[3])) print(t[0], t[1], t[2], t[3], t[8]); - else print(t[0], t[1], t[3], t[2], t[8]); - print(t[4], t[5], t[6], t[7], t[8]); - } - - buf.destroy(); - file.close(); -} - -/******************* - * command sam2pas * - *******************/ - -function bwa_sam2pas(args) -{ - var file = args.length == 0? new File() : new File(args[0]); - var buf = new Bytes(); - var seq_dict = {}; - - while (file.readline(buf) >= 0) { - var line = buf.toString(); - var m; - if (/^@SQ/.test(line)) { - var name = null, len = null; - if ((m = /\tSN:(\S+)/.exec(line)) != null) name = m[1]; - if ((m = /\tLN:(\S+)/.exec(line)) != null) len = parseInt(m[1]); - if (name != null && len != null) seq_dict[name] = len; - } - if (/^@/.test(line)) continue; - var t = line.split("\t"); - var pos = parseInt(t[3]) - 1; - var x = 0, y = 0, i = 0, clip = [0, 0], n_ins = 0, n_del = 0, o_ins = 0, o_del = 0, n_M = 0; - var re = /(\d+)([MIDSH])/g; - while ((m = re.exec(t[5])) != null) { - var l = parseInt(m[1]); - if (m[2] == 'M') x += l, y += l, n_M += l; - else if (m[2] == 'I') y += l, n_ins += l, ++o_ins; - else if (m[2] == 'D') x += l, n_del += l, ++o_del; - else if (m[2] == 'S' || m[2] == 'H') - clip[i == 0? 0 : 1] = l; - ++i; - } - var is_rev = (parseInt(t[1]) & 16)? true : false; - var misc = 'mapQ=' + t[4] + ';'; - var usc = 1; - if ((m = /\tNM:i:(\d+)/.exec(line)) != null) { - var NM = parseInt(m[1]); - var diff = (NM / (n_M + n_ins + n_del)).toFixed(3); - misc += 'diff=' + diff + ';n_mis=' + (NM - n_del - n_ins) + ';'; - } - misc += 'n_del='+n_del+';n_ins='+n_ins+';o_del='+o_del+';o_ins='+o_ins + ';'; - if ((m = /\tAS:i:(\d+)/.exec(line)) != null) { - misc += 'AS='+m[1] + ';'; - usc = (parseInt(m[1]) / (x > y? 
x : y)).toFixed(3); - } - if ((m = /\tXS:i:(\d+)/.exec(line)) != null) misc += 'XS='+m[1] + ';'; - var len = y + clip[0] + clip[1]; - var z = [t[0], len, clip[0], clip[0] + y, t[2], seq_dict[t[2]], pos, pos + x, usc, misc]; - if (parseInt(t[1]) & 16) z[2] = clip[1] + y, z[3] = clip[1]; - print(z.join("\t")); - } - - buf.destroy(); - file.close(); -} - -/*********************** - *** command reg2cut *** - ***********************/ - -function bwa_reg2cut(args) -{ - var c, min_usc = 0.5, min_ext = 100, min_len = 5000, cut = 250; - while ((c = getopt(args, "s:e:l:c:")) != null) { - if (c == 's') min_usc = parseFloat(getopt.arg); - else if (c == 'e') min_ext = parseInt(getopt.arg); - else if (c == 'l') min_len = parseInt(getopt.arg); - else if (c == 'c') cut = parseInt(getopt.arg); - } - - var file = args.length == getopt.ind? new File () : new File(args[getopt.ind]); - var buf = new Bytes(); - - function print_bed() { - for (var i = 0; i < a.length; ++i) { - var start = a[i][0] - cut > 0? a[i][0] : 0; - var end = a[i][1] + cut < last_len? a[i][1] : last_len; - if (end - start >= min_len) print(last_chr, start, end); - } - } - - var last_chr = null, last_len = null, max_c_usc = 0, start = 0, end = 0; - var a = []; - while (file.readline(buf) >= 0) { - var t = buf.toString().split("\t"); - t[1] = parseInt(t[1]); - t[2] = parseInt(t[2]); - t[3] = parseInt(t[3]); - t[4] = parseFloat(t[4]); - var is_contained = t[2] < min_ext && t[1] - t[3] < min_ext? true : false; - if (t[3] - t[2] < cut<<1) continue; - t[2] += cut; t[3] -= cut; - if (t[0] != last_chr) { - a.push([start, end]); - if (last_chr != null && max_c_usc < min_usc) print_bed(); - last_chr = t[0]; last_len = t[1]; start = t[2]; end = t[3]; - max_c_usc = is_contained? t[4] : 0; - a.length = 0; - } else { - if (is_contained) - max_c_usc = max_c_usc > t[4]? max_c_usc : t[4]; - if (t[4] < min_usc) continue; - if (t[2] > end) { - a.push([start, end]); - start = t[2]; - end = end > t[3]? 
end : t[3]; - } else end = end > t[3]? end : t[3]; - } - } - a.push([start, end]); - if (max_c_usc < min_usc) print_bed(); // the last sequence - - buf.destroy(); - file.close(); -} - -function bwa_shortname(args) -{ - var file = args.length? new File(args[0]) : new File(); - var buf = new Bytes(); - - var re = /(\S+)\/(\d+)_(\d+)((:\d+-\d+)+)/g; - var re2 = /:(\d+)-(\d+)/g; - while (file.readline(buf) >= 0) { - var match, match2; - var line = buf.toString(); - var x = []; - while ((match = re.exec(line)) != null) { - var start = parseInt(match[2]), len = parseInt(match[3]) - start; - while ((match2 = re2.exec(match[4])) != null) { - var a = parseInt(match2[1]) - 1; - var b = parseInt(match2[2]); - start += a; len = b - a; - } - x.push([match[0], match[1] + '/' + start.toString() + '_' + (start+len).toString()]); - } - for (var i = 0; i < x.length; ++i) - line = line.replace(x[i][0], x[i][1]); - print(line); - } - - buf.destroy(); - file.close(); -} - -/******************* - * Command gff2sam * - *******************/ - -function bwa_gff2sam(args) -{ - if (args.length < 2) { - print("Usage: k8 bwa-helper.js "); - exit(1); - } - - var file = new File(args[1]); - var buf = new Bytes(); - var len = {}; - - while (file.readline(buf) >= 0) { - var t = buf.toString().split(/\s+/); - len[t[0]] = parseInt(t[1]); - } - file.close(); - - file = new File(args[0]); - var re_cigar = /([MID])(\d+)/g; - var lineno = 0; - while (file.readline(buf) >= 0) { - ++lineno; - var t = buf.toString().split("\t"); - var m = /Target=(\S+)\s+(\d+)\s+(\d+)\s+([+-])/.exec(t[8]); - if (m == null) { - warn("WARNING: skipped line "+lineno+" due to the lack of Target."); - continue; - } - var qname = m[1]; - var flag = t[6] == m[4]? 
0 : 16; - var qb = parseInt(m[2]) - 1, qe = parseInt(m[3]), qlen = len[qname]; - if (qlen == null) - throw Error("Sequence "+qname+" is not present in the query-length.txt"); - var clip5 = qb, clip3 = qlen - qe; - if (flag&16) clip5 ^= clip3, clip3 ^= clip5, clip5 ^= clip3; // swap - - m = /Gap\s*=\s*(([MID]\d+\s*)+)/.exec(t[8]); - var cigar = clip5? clip5 + 'S' : ''; - var n_ins = 0, n_del = 0, n_match = 0, NM = null; - if (m) { - var mc; - while ((mc = re_cigar.exec(m[1])) != null) { - var l = parseInt(mc[2]); - cigar += mc[2] + mc[1]; - if (mc[1] == 'I') n_ins += l; - else if (mc[1] == 'D') n_del += l; - else if (mc[1] == 'M') n_match += l; - } - if (n_ins + n_match != qe - qb || n_del + n_match != parseInt(t[4]) - parseInt(t[3]) + 1) - throw Error("Inconsistent CIGAR at line "+lineno); - } else { // ungapped alignment - var tb = parseInt(t[3]) - 1, te = parseInt(t[4]); - if (te - tb != qe - qb) { - warn("WARNING: line "+lineno+" should contain gaps, but lacks Gap. Skipped.\n"); - } else cigar = (qe - qb) + 'M'; - } - if (clip3) cigar += clip3 + 'S'; - if ((m = /num_mismatch=(\d+)/.exec(t[8])) != null) - NM = parseInt(m[1]) + n_ins + n_del; - var out = [qname, flag, t[0], t[3], 255, cigar, '*', 0, 0, '*', '*']; - if (NM != null) out.push('NM:i:' + NM); - print(out.join("\t")); - } - buf.destroy(); - file.close(); -} - - -/****************************** - *** Generate ALT alignment *** - ******************************/ - -function intv_ovlp(intv, bits) // interval index -{ - if (typeof bits == "undefined") bits = 13; - intv.sort(function(a,b) {return a[0]-b[0];}); - // create the index - var idx = [], max = 0; - for (var i = 0; i < intv.length; ++i) { - var b = intv[i][0]>>bits; - var e = (intv[i][1]-1)>>bits; - if (b != e) { - for (var j = b; j <= e; ++j) - if (idx[j] == null) idx[j] = i; - } else if (idx[b] == null) idx[b] = i; - max = max > e? 
max : e; - } - // closure - return function(_b, _e) { - var x = _b >> bits; - if (x > max) return []; - var off = idx[x]; - if (off == null) { - var i; - for (i = ((_e - 1) >> bits) - 1; i >= 0; --i) - if (idx[i] != null) break; - off = i < 0? 0 : idx[i]; - } - var ovlp = []; - for (var i = off; i < intv.length && intv[i][0] < _e; ++i) - if (intv[i][1] > _b) ovlp.push(intv[i]); - return ovlp; - } -} - -function bwa_genalt(args) -{ - var re_cigar = /(\d+)([MIDSHN])/g; - - function cigar2pos(cigar, pos) // given a pos on ALT and the ALT-to-REF CIGAR, find the pos on REF - { - var x = 0, y = 0; - for (var i = 0; i < cigar.length; ++i) { - var op = cigar[i][0], len = cigar[i][1]; - if (op == 'M') { - if (y <= pos && pos < y + len) - return x + (pos - y); - x += len, y += len; - } else if (op == 'D') { - x += len; - } else if (op == 'I') { - if (y <= pos && pos < y + len) - return x; - y += len; - } else if (op == 'S' || op == 'H') { - if (y <= pos && pos < y + len) - return -1; - y += len; - } - } - return -1; - } - - function parse_hit(s, opt) // parse a hit. s looks something like ["chr1", "+12345", "100M", 5] - { - var h = {}; - h.ctg = s[0]; - h.start = parseInt(s[1].substr(1)) - 1; - h.rev = (s[1].charAt(0) == '-'); - h.cigar = s[2]; - h.NM = parseInt(s[3]); - var m, l_ins, n_ins, l_del, n_del, l_match, l_skip, l_clip; - l_ins = l_del = n_ins = n_del = l_match = l_skip = l_clip = 0; - while ((m = re_cigar.exec(h.cigar)) != null) { - var l = parseInt(m[1]); - if (m[2] == 'M') l_match += l; - else if (m[2] == 'D') ++n_del, l_del += l; - else if (m[2] == 'I') ++n_ins, l_ins += l; - else if (m[2] == 'N') l_skip += l; - else if (m[2] == 'H' || m[2] == 'S') l_clip += l; - } - h.end = h.start + l_match + l_del + l_skip; - h.NM = h.NM > l_del + l_ins? 
h.NM : l_del + l_ins; - h.score = Math.floor((opt.a * l_match - (opt.a + opt.b) * (h.NM - l_del - l_ins) - opt.o * (n_del + n_ins) - opt.e * (l_del + l_ins)) / opt.a + .499); - h.l_query = l_match + l_ins + l_clip; - return h; - } - - var c, opt = { a:1, b:4, o:6, e:1, verbose:3 }; - - while ((c = getopt(args, 'v:')) != null) { - if (c == 'v') opt.verbose = parseInt(getopt.arg); - } - - if (args.length == getopt.ind) { - print("Usage: k8 bwa-helper.js genalt [aln.sam]"); - exit(1); - } - - var file, buf = new Bytes(); - var aux = new Bytes(); // used for reverse and reverse complement - - // read the ALT-to-REF alignment and generate the index - var intv = {}; - file = new File(args[getopt.ind]); - while (file.readline(buf) >= 0) { - var line = buf.toString(); - var t = line.split("\t"); - if (line.charAt(0) == '@') continue; - var flag = parseInt(t[1]); - var m, cigar = [], l_qaln = 0, l_qclip = 0; - while ((m = re_cigar.exec(t[5])) != null) { - var l = parseInt(m[1]); - cigar.push([m[2] != 'H'? m[2] : 'S', l]); // convert hard clip to soft clip - if (m[2] == 'M' || m[2] == 'I') l_qaln += l; - else if (m[2] == 'S' || m[2] == 'H') l_qclip += l; - } - var j = flag&16? cigar.length-1 : 0; - var start = cigar[j][0] == 'S'? cigar[j][1] : 0; - if (intv[t[0]] == null) intv[t[0]] = []; - intv[t[0]].push([start, start + l_qaln, l_qaln + l_qclip, t[2], flag&16? true : false, parseInt(t[3]) - 1, cigar]); - //print(start, start + l_qaln, t[2], flag&16? true : false, parseInt(t[3]), cigar); - } - file.close(); - // create the interval index - var idx = {}; - for (var ctg in intv) - idx[ctg] = intv_ovlp(intv[ctg]); - - // process SAM - file = args.length - getopt.ind >= 2? 
new File(args[getopt.ind+1]) : new File(); - while (file.readline(buf) >= 0) { - var m, line = buf.toString(); - if (line.charAt(0) == '@' || (m = /\tXA:Z:(\S+)/.exec(line)) == null) { // TODO: this does not work with PE file - if (opt.verbose < 4) print(line); - continue; - } - - // parse hits - var hits = []; - var XA_strs = m[1].split(";"); - var NM = (m = /\tNM:i:(\d+)/.exec(line)) == null? '0' : m[1]; - var t = line.split("\t"); - var flag = parseInt(t[1]); - hits.push(parse_hit([t[2], ((flag&16)?'-':'+') + t[3], t[5], NM], opt)); - for (var i = 0; i < XA_strs.length; ++i) // hits in the XA tag - if (XA_strs[i] != '') // as the last symbol in an XA tag is ";", the last split is an empty string - hits.push(parse_hit(XA_strs[i].split(","), opt)); - - // lift mapping positions to coordinates on the primary assembly - var n_lifted = 0; - for (var i = 0; i < hits.length; ++i) { - var h = hits[i]; - - if (idx[h.ctg] == null) continue; - var a = idx[h.ctg](h.start, h.end); - if (a == null || a.length == 0) continue; - - // find the approximate position on the primary assembly - var lifted = []; - for (var j = 0; j < a.length; ++j) { - var s, e; - if (!a[j][4]) { // ALT is mapped to the forward strand of the primary assembly - s = cigar2pos(a[j][6], h.start); - e = cigar2pos(a[j][6], h.end - 1) + 1; - } else { - s = cigar2pos(a[j][6], a[j][2] - h.end); - e = cigar2pos(a[j][6], a[j][2] - h.start - 1) + 1; - } - if (s < 0 || e < 0) continue; // read is mapped to clippings in the ALT-to-chr alignment - s += a[j][5]; e += a[j][5]; - lifted.push([a[j][3], (h.rev!=a[j][4]), s, e]); - } - if (lifted.length) ++n_lifted, hits[i].lifted = lifted; - } - if (n_lifted == 0) { - if (opt.verbose < 4) print(line); - continue; - } - - // group hits - for (var i = 0; i < hits.length; ++i) { // set keys for sorting - if (hits[i].lifted && hits[i].lifted.length) // TODO: only the first element in lifted[] is used - hits[i].pctg = hits[i].lifted[0][0], hits[i].pstart = 
hits[i].lifted[0][2], hits[i].pend = hits[i].lifted[0][3]; - else hits[i].pctg = hits[i].ctg, hits[i].pstart = hits[i].start, hits[i].pend = hits[i].end; - hits[i].i = i; // keep the original index - } - hits.sort(function(a,b) { return a.pctg != b.pctg? (a.pctg < b.pctg? -1 : 1) : a.pstart - b.pstart }); - var last_chr = null, end = 0, g = -1; - for (var i = 0; i < hits.length; ++i) { - if (last_chr != hits[i].pctg) ++g, last_chr = hits[i].pctg, end = 0; - else if (hits[i].pstart >= end) ++g; - hits[i].g = g; - end = end > hits[i].pend? end : hits[i].pend; - } - var reported_g = null, reported_i = null; - for (var i = 0; i < hits.length; ++i) - if (hits[i].i == 0) - reported_g = hits[i].g, reported_i = i; - var n_group0 = 0; // #hits overlapping the reported hit - for (var i = 0; i < hits.length; ++i) - if (hits[i].g == reported_g) - ++n_group0; - if (n_group0 == 1) { // then keep the reported alignment and mapQ - if (opt.verbose < 4) print(line); - continue; - } - - // re-estimate mapQ - var group_max = []; - for (var i = 0; i < hits.length; ++i) { - var g = hits[i].g; - if (group_max[g] == null || group_max[g][0] < hits[i].score) - group_max[g] = [hits[i].score, g]; - } - if (group_max.length > 1) - group_max.sort(function(x,y) {return y[0]-x[0]}); - var mapQ; - if (group_max[0][1] == reported_g) { // the best hit is the hit reported in SAM - mapQ = group_max.length == 1? 60 : 6 * (group_max[0][0] - group_max[1][0]); - } else mapQ = 0; - mapQ = mapQ < 60? mapQ : 60; - var ori_mapQ = parseInt(t[4]); - mapQ = mapQ > ori_mapQ? 
mapQ : ori_mapQ; - - // generate lifted_str - for (var i = 0; i < hits.length; ++i) { - if (hits[i].lifted && hits[i].lifted.length) { - var lifted = hits[i].lifted; - var u = ''; - for (var j = 0; j < lifted.length; ++j) - u += lifted[j][0] + "," + lifted[j][2] + "," + lifted[j][3] + "," + (lifted[j][1]?'-':'+') + ";"; - hits[i].lifted_str = u; - } - } - - // generate reversed quality and reverse-complemented sequence if necessary - var rs = null, rq = null; // reversed quality and reverse complement sequence - var need_rev = false; - for (var i = 0; i < hits.length; ++i) { - if (hits[i].g != reported_g || i == reported_i) continue; - if (hits[i].rev != hits[reported_i].rev) - need_rev = true; - } - if (need_rev) { // reverse and reverse complement - aux.set(t[9], 0); aux.revcomp(); rs = aux.toString(); - aux.set(t[10],0); aux.reverse(); rq = aux.toString(); - } - - // print - t[4] = mapQ; - t.push("om:i:"+ori_mapQ); - if (hits[reported_i].lifted_str) t.push("lt:Z:" + hits[reported_i].lifted_str); - print(t.join("\t")); - var cnt = 0; - for (var i = 0; i < hits.length; ++i) { - if (opt.verbose >= 5) print(obj2str(hits[i])); - if (hits[i].g != reported_g || i == reported_i) continue; - var s = [t[0], flag&0xf10, hits[i].ctg, hits[i].start+1, mapQ, hits[i].cigar, '*', 0, 0]; - // update name - if (flag&0x40) s[0] += "/1"; - if (flag&0x80) s[0] += "/2"; - s[0] += "_" + (++cnt); - if (hits[i].rev == hits[reported_i].rev) s.push(t[9], t[10]); - else s.push(rs, rq); - s.push("NM:i:" + hits[i].NM); - if (hits[i].lifted_str) s.push("lt:Z:" + hits[i].lifted_str); - print(s.join("\t")); - } - } - file.close(); - - aux.destroy(); - buf.destroy(); -} - -/********************* - *** Main function *** - *********************/ - -function main(args) -{ - if (args.length == 0) { - print("\nUsage: k8 bwa-helper.js [arguments]\n"); - print("Commands: genalt generate ALT alignments"); - print(" sam2pas convert SAM to pairwise alignment summary format (PAS)"); - print(" pas2reg 
extract covered regions"); - print(" reg2cut regions to extract for the 2nd round bwa-mem"); - print(" markovlp identify bi-directional overlaps"); - print(" gff2sam convert GFF3 alignment to SAM"); - print(" shortname shorten sequence name after subseq (PacBio read names only)"); - print(""); - exit(1); - } - - var cmd = args.shift(); - if (cmd == 'sam2pas') bwa_sam2pas(args); - else if (cmd == 'gff2sam') bwa_gff2sam(args); - else if (cmd == 'markovlp') bwa_markOvlp(args); - else if (cmd == 'pas2reg') bwa_pas2reg(args); - else if (cmd == 'reg2cut') bwa_reg2cut(args); - else if (cmd == 'genalt') bwa_genalt(args); - else if (cmd == 'shortname') bwa_shortname(args); - else warn("Unrecognized command"); -} - -main(arguments); diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/bwa.1 --- a/bwa-0.7.9a/bwa.1 Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,782 +0,0 @@ -.TH bwa 1 "19 May 2014" "bwa-0.7.9-r783" "Bioinformatics tools" -.SH NAME -.PP -bwa - Burrows-Wheeler Alignment Tool -.SH SYNOPSIS -.PP -bwa index ref.fa -.PP -bwa mem ref.fa reads.fq > aln-se.sam -.PP -bwa mem ref.fa read1.fq read2.fq > aln-pe.sam -.PP -bwa aln ref.fa short_read.fq > aln_sa.sai -.PP -bwa samse ref.fa aln_sa.sai short_read.fq > aln-se.sam -.PP -bwa sampe ref.fa aln_sa1.sai aln_sa2.sai read1.fq read2.fq > aln-pe.sam -.PP -bwa bwasw ref.fa long_read.fq > aln.sam - -.SH DESCRIPTION -.PP -BWA is a software package for mapping low-divergent sequences against a large -reference genome, such as the human genome. It consists of three algorithms: -BWA-backtrack, BWA-SW and BWA-MEM. The first algorithm is designed for Illumina -sequence reads up to 100bp, while the rest two for longer sequences ranged from -70bp to 1Mbp. BWA-MEM and BWA-SW share similar features such as long-read -support and split alignment, but BWA-MEM, which is the latest, is generally -recommended for high-quality queries as it is faster and more accurate. 
-BWA-MEM also has better performance than BWA-backtrack for 70-100bp Illumina -reads. - -For all the algorithms, BWA first needs to construct the FM-index for -the reference genome (the -.B index -command). Alignment algorithms are invoked with different sub-commands: -.BR aln / samse / sampe -for BWA-backtrack, -.B bwasw -for BWA-SW and -.B mem -for the BWA-MEM algorithm. - -.SH COMMANDS AND OPTIONS -.TP -.B index -.B bwa index -.RB [ -p -.IR prefix ] -.RB [ -a -.IR algoType ] -.I db.fa - -Index database sequences in the FASTA format. - -.B OPTIONS: -.RS -.TP 10 -.BI -p \ STR -Prefix of the output database [same as db filename] -.TP -.BI -a \ STR -Algorithm for constructing BWT index. BWA implements two algorithms for BWT -construction: -.B is -and -.BR bwtsw . -The first algorithm is a little faster for small database but requires large -RAM and does not work for databases with total length longer than 2GB. The -second algorithm is adapted from the BWT-SW source code. It in theory works -with database with trillions of bases. When this option is not specified, the -appropriate algorithm will be chosen automatically. -.RE - -.TP -.B mem -.B bwa mem -.RB [ -aCHMpP ] -.RB [ -t -.IR nThreads ] -.RB [ -k -.IR minSeedLen ] -.RB [ -w -.IR bandWidth ] -.RB [ -d -.IR zDropoff ] -.RB [ -r -.IR seedSplitRatio ] -.RB [ -c -.IR maxOcc ] -.RB [ -A -.IR matchScore ] -.RB [ -B -.IR mmPenalty ] -.RB [ -O -.IR gapOpenPen ] -.RB [ -E -.IR gapExtPen ] -.RB [ -L -.IR clipPen ] -.RB [ -U -.IR unpairPen ] -.RB [ -R -.IR RGline ] -.RB [ -v -.IR verboseLevel ] -.I db.prefix -.I reads.fq -.RI [ mates.fq ] - -Align 70bp-1Mbp query sequences with the BWA-MEM algorithm. Briefly, the -algorithm works by seeding alignments with maximal exact matches (MEMs) and -then extending seeds with the affine-gap Smith-Waterman algorithm (SW). - -If -.I mates.fq -file is absent and option -.B -p -is not set, this command regards input reads are single-end. 
If -.I mates.fq -is present, this command assumes the -.IR i -th -read in -.I reads.fq -and the -.IR i -th -read in -.I mates.fq -constitute a read pair. If -.B -p -is used, the command assumes the -.RI 2 i -th -and the -.RI (2 i +1)-th -read in -.I reads.fq -constitute a read pair (such input file is said to be interleaved). In this case, -.I mates.fq -is ignored. In the paired-end mode, the -.B mem -command will infer the read orientation and the insert size distribution from a -batch of reads. - -The BWA-MEM algorithm performs local alignment. It may produce multiple primary -alignments for different part of a query sequence. This is a crucial feature -for long sequences. However, some tools such as Picard's markDuplicates does -not work with split alignments. One may consider to use option -.B -M -to flag shorter split hits as secondary. - -.RS -.TP 10 -.B ALGORITHM OPTIONS: -.TP -.BI -t \ INT -Number of threads [1] -.TP -.BI -k \ INT -Minimum seed length. Matches shorter than -.I INT -will be missed. The alignment speed is usually insensitive to this value unless -it significantly deviates from 20. [19] -.TP -.BI -w \ INT -Band width. Essentially, gaps longer than -.I INT -will not be found. Note that the maximum gap length is also affected by the -scoring matrix and the hit length, not solely determined by this option. [100] -.TP -.BI -d \ INT -Off-diagonal X-dropoff (Z-dropoff). Stop extension when the difference between -the best and the current extension score is above -.RI | i - j |* A + INT , -where -.I i -and -.I j -are the current positions of the query and reference, respectively, and -.I A -is the matching score. Z-dropoff is similar to BLAST's X-dropoff except that it -doesn't penalize gaps in one of the sequences in the alignment. Z-dropoff not -only avoids unnecessary extension, but also reduces poor alignments inside a -long good alignment. [100] -.TP -.BI -r \ FLOAT -Trigger re-seeding for a MEM longer than -.IR minSeedLen * FLOAT . 
-This is a key heuristic parameter for tuning the performance. Larger value -yields fewer seeds, which leads to faster alignment speed but lower accuracy. [1.5] -.TP -.BI -c \ INT -Discard a MEM if it has more than -.I INT -occurence in the genome. This is an insensitive parameter. [500] -.TP -.B -P -In the paired-end mode, perform SW to rescue missing hits only but do not try to find -hits that fit a proper pair. -.TP -.BI -A \ INT -Matching score. [1] -.TP -.BI -B \ INT -Mismatch penalty. The sequence error rate is approximately: {.75 * exp[-log(4) * B/A]}. [4] -.TP -.BI -O \ INT[,INT] -Gap open penalty. If two numbers are specified, the first is the penalty of -openning a deletion and the second for openning an insertion. [6] -.TP -.BI -E \ INT[,INT] -Gap extension penalty. If two numbers are specified, the first is the penalty -of extending a deletion and second for extending an insertion. A gap of length -k costs O + k*E (i.e. -.B -O -is for opening a zero-length gap). [1] -.TP -.BI -L \ INT[,INT] -Clipping penalty. When performing SW extension, BWA-MEM keeps track of the best -score reaching the end of query. If this score is larger than the best SW score -minus the clipping penalty, clipping will not be applied. Note that in this -case, the SAM AS tag reports the best SW score; clipping penalty is not -deduced. If two numbers are provided, the first is for 5'-end clipping and -second for 3'-end clipping. [5] -.TP -.BI -U \ INT -Penalty for an unpaired read pair. BWA-MEM scores an unpaired read pair as -.RI scoreRead1+scoreRead2- INT -and scores a paired as scoreRead1+scoreRead2-insertPenalty. It compares these -two scores to determine whether we should force pairing. A larger value leads to -more aggressive read pair. [17] - -.TP -.B INPUT/OUTPUT OPTIONS: -.TP -.B -p -Assume the first input query file is interleaved paired-end FASTA/Q. See the -command description for details. -.TP -.BI -R \ STR -Complete read group header line. 
'\\t' can be used in -.I STR -and will be converted to a TAB in the output SAM. The read group ID will be -attached to every read in the output. An example is '@RG\\tID:foo\\tSM:bar'. -[null] -.TP -.BI -T \ INT -Don't output alignment with score lower than -.IR INT . -This option affects output and occasionally SAM flag 2. [30] -.TP -.BI -h \ INT -If a query has not more than -.I INT -hits with score higher than 80% of the best hit, output them all in the XA tag [5] -.TP -.B -a -Output all found alignments for single-end or unpaired paired-end reads. These -alignments will be flagged as secondary alignments. -.TP -.B -C -Append append FASTA/Q comment to SAM output. This option can be used to -transfer read meta information (e.g. barcode) to the SAM output. Note that the -FASTA/Q comment (the string after a space in the header line) must conform the SAM -spec (e.g. BC:Z:CGTAC). Malformated comments lead to incorrect SAM output. -.TP -.B -Y -Use soft clipping CIGAR operation for supplementary alignments. By default, BWA-MEM -uses soft clipping for the primary alignment and hard clipping for -supplementary alignments. -.TP -.B -M -Mark shorter split hits as secondary (for Picard compatibility). -.TP -.BI -v \ INT -Control the verbose level of the output. This option has not been fully -supported throughout BWA. Ideally, a value 0 for disabling all the output to -stderr; 1 for outputting errors only; 2 for warnings and errors; 3 for -all normal messages; 4 or higher for debugging. When this option takes value -4, the output is not SAM. [3] -.TP -.BI -I \ FLOAT[,FLOAT[,INT[,INT]]] -Specify the mean, standard deviation (10% of the mean if absent), max (4 sigma -from the mean if absent) and min (4 sigma if absent) of the insert size -distribution. Only applicable to the FR orientation. By default, BWA-MEM infers -these numbers and the pair orientations given enough reads. 
[inferred] - -.RE - -.TP -.B aln -bwa aln [-n maxDiff] [-o maxGapO] [-e maxGapE] [-d nDelTail] [-i -nIndelEnd] [-k maxSeedDiff] [-l seedLen] [-t nThrds] [-cRN] [-M misMsc] -[-O gapOsc] [-E gapEsc] [-q trimQual] > - - -Find the SA coordinates of the input reads. Maximum -.I maxSeedDiff -differences are allowed in the first -.I seedLen -subsequence and maximum -.I maxDiff -differences are allowed in the whole sequence. - -.B OPTIONS: -.RS -.TP 10 -.BI -n \ NUM -Maximum edit distance if the value is INT, or the fraction of missing -alignments given 2% uniform base error rate if FLOAT. In the latter -case, the maximum edit distance is automatically chosen for different -read lengths. [0.04] -.TP -.BI -o \ INT -Maximum number of gap opens [1] -.TP -.BI -e \ INT -Maximum number of gap extensions, -1 for k-difference mode (disallowing -long gaps) [-1] -.TP -.BI -d \ INT -Disallow a long deletion within INT bp towards the 3'-end [16] -.TP -.BI -i \ INT -Disallow an indel within INT bp towards the ends [5] -.TP -.BI -l \ INT -Take the first INT subsequence as seed. If INT is larger than the query -sequence, seeding will be disabled. For long reads, this option is -typically ranged from 25 to 35 for `-k 2'. [inf] -.TP -.BI -k \ INT -Maximum edit distance in the seed [2] -.TP -.BI -t \ INT -Number of threads (multi-threading mode) [1] -.TP -.BI -M \ INT -Mismatch penalty. BWA will not search for suboptimal hits with a score -lower than (bestScore-misMsc). [3] -.TP -.BI -O \ INT -Gap open penalty [11] -.TP -.BI -E \ INT -Gap extension penalty [4] -.TP -.BI -R \ INT -Proceed with suboptimal alignments if there are no more than INT equally -best hits. This option only affects paired-end mapping. Increasing this -threshold helps to improve the pairing accuracy at the cost of speed, -especially for short reads (~32bp). -.TP -.B -c -Reverse query but not complement it, which is required for alignment in -the color space. (Disabled since 0.6.x) -.TP -.B -N -Disable iterative search. 
All hits with no more than -.I maxDiff -differences will be found. This mode is much slower than the default. -.TP -.BI -q \ INT -Parameter for read trimming. BWA trims a read down to -argmax_x{\\sum_{i=x+1}^l(INT-q_i)} if q_l 1.sai - bwa aln ref.fa -b2 reads.bam > 2.sai - bwa sampe ref.fa 1.sai 2.sai reads.bam reads.bam > aln.sam -.TP -.B -0 -When -.B -b -is specified, only use single-end reads in mapping. -.TP -.B -1 -When -.B -b -is specified, only use the first read in a read pair in mapping (skip -single-end reads and the second reads). -.TP -.B -2 -When -.B -b -is specified, only use the second read in a read pair in mapping. -.B -.RE - -.TP -.B samse -bwa samse [-n maxOcc] > - -Generate alignments in the SAM format given single-end reads. Repetitive -hits will be randomly chosen. - -.B OPTIONS: -.RS -.TP 10 -.BI -n \ INT -Maximum number of alignments to output in the XA tag for reads paired -properly. If a read has more than INT hits, the XA tag will not be -written. [3] -.TP -.BI -r \ STR -Specify the read group in a format like `@RG\\tID:foo\\tSM:bar'. [null] -.RE - -.TP -.B sampe -bwa sampe [-a maxInsSize] [-o maxOcc] [-n maxHitPaired] [-N maxHitDis] -[-P] > - -Generate alignments in the SAM format given paired-end reads. Repetitive -read pairs will be placed randomly. - -.B OPTIONS: -.RS -.TP 8 -.BI -a \ INT -Maximum insert size for a read pair to be considered being mapped -properly. Since 0.4.5, this option is only used when there are not -enough good alignment to infer the distribution of insert sizes. [500] -.TP -.BI -o \ INT -Maximum occurrences of a read for pairing. A read with more occurrneces -will be treated as a single-end read. Reducing this parameter helps -faster pairing. [100000] -.TP -.B -P -Load the entire FM-index into memory to reduce disk operations -(base-space reads only). With this option, at least 1.25N bytes of -memory are required, where N is the length of the genome. 
-.TP -.BI -n \ INT -Maximum number of alignments to output in the XA tag for reads paired -properly. If a read has more than INT hits, the XA tag will not be -written. [3] -.TP -.BI -N \ INT -Maximum number of alignments to output in the XA tag for disconcordant -read pairs (excluding singletons). If a read has more than INT hits, the -XA tag will not be written. [10] -.TP -.BI -r \ STR -Specify the read group in a format like `@RG\\tID:foo\\tSM:bar'. [null] -.RE - -.TP -.B bwasw -bwa bwasw [-a matchScore] [-b mmPen] [-q gapOpenPen] [-r gapExtPen] [-t -nThreads] [-w bandWidth] [-T thres] [-s hspIntv] [-z zBest] [-N -nHspRev] [-c thresCoef] [mate.fq] - -Align query sequences in the -.I in.fq -file. When -.I mate.fq -is present, perform paired-end alignment. The paired-end mode only works -for reads Illumina short-insert libraries. In the paired-end mode, BWA-SW -may still output split alignments but they are all marked as not properly -paired; the mate positions will not be written if the mate has multiple -local hits. - -.B OPTIONS: -.RS -.TP 10 -.BI -a \ INT -Score of a match [1] -.TP -.BI -b \ INT -Mismatch penalty [3] -.TP -.BI -q \ INT -Gap open penalty [5] -.TP -.BI -r \ INT -Gap extension penalty. The penalty for a contiguous gap of size k is -q+k*r. [2] -.TP -.BI -t \ INT -Number of threads in the multi-threading mode [1] -.TP -.BI -w \ INT -Band width in the banded alignment [33] -.TP -.BI -T \ INT -Minimum score threshold divided by a [37] -.TP -.BI -c \ FLOAT -Coefficient for threshold adjustment according to query length. Given an -l-long query, the threshold for a hit to be retained is -a*max{T,c*log(l)}. [5.5] -.TP -.BI -z \ INT -Z-best heuristics. Higher -z increases accuracy at the cost of speed. [1] -.TP -.BI -s \ INT -Maximum SA interval size for initiating a seed. Higher -s increases -accuracy at the cost of speed. [3] -.TP -.BI -N \ INT -Minimum number of seeds supporting the resultant alignment to skip -reverse alignment. 
[5] -.RE - -.SH SAM ALIGNMENT FORMAT -.PP -The output of the -.B `aln' -command is binary and designed for BWA use only. BWA outputs the final -alignment in the SAM (Sequence Alignment/Map) format. Each line consists -of: - -.TS -center box; -cb | cb | cb -n | l | l . -Col Field Description -_ -1 QNAME Query (pair) NAME -2 FLAG bitwise FLAG -3 RNAME Reference sequence NAME -4 POS 1-based leftmost POSition/coordinate of clipped sequence -5 MAPQ MAPping Quality (Phred-scaled) -6 CIAGR extended CIGAR string -7 MRNM Mate Reference sequence NaMe (`=' if same as RNAME) -8 MPOS 1-based Mate POSistion -9 ISIZE Inferred insert SIZE -10 SEQ query SEQuence on the same strand as the reference -11 QUAL query QUALity (ASCII-33 gives the Phred base quality) -12 OPT variable OPTional fields in the format TAG:VTYPE:VALUE -.TE - -.PP -Each bit in the FLAG field is defined as: - -.TS -center box; -cb | cb | cb -c | l | l . -Chr Flag Description -_ -p 0x0001 the read is paired in sequencing -P 0x0002 the read is mapped in a proper pair -u 0x0004 the query sequence itself is unmapped -U 0x0008 the mate is unmapped -r 0x0010 strand of the query (1 for reverse) -R 0x0020 strand of the mate -1 0x0040 the read is the first read in a pair -2 0x0080 the read is the second read in a pair -s 0x0100 the alignment is not primary -f 0x0200 QC failure -d 0x0400 optical or PCR duplicate -.TE - -.PP -The Please check for the format -specification and the tools for post-processing the alignment. - -BWA generates the following optional fields. Tags starting with `X' are -specific to BWA. - -.TS -center box; -cb | cb -cB | l . 
-Tag Meaning -_ -NM Edit distance -MD Mismatching positions/bases -AS Alignment score -BC Barcode sequence -SA Supplementary alignments -_ -X0 Number of best hits -X1 Number of suboptimal hits found by BWA -XN Number of ambiguous bases in the referenece -XM Number of mismatches in the alignment -XO Number of gap opens -XG Number of gap extentions -XT Type: Unique/Repeat/N/Mate-sw -XA Alternative hits; format: /(chr,pos,CIGAR,NM;)*/ -_ -XS Suboptimal alignment score -XF Support from forward/reverse alignment -XE Number of supporting seeds -_ -XP Alt primary hits; format: /(chr,pos,CIGAR,mapQ,NM;)+/ -.TE - -.PP -Note that XO and XG are generated by BWT search while the CIGAR string -by Smith-Waterman alignment. These two tags may be inconsistent with the -CIGAR string. This is not a bug. - -.SH NOTES ON SHORT-READ ALIGNMENT -.SS Alignment Accuracy -.PP -When seeding is disabled, BWA guarantees to find an alignment -containing maximum -.I maxDiff -differences including -.I maxGapO -gap opens which do not occur within -.I nIndelEnd -bp towards either end of the query. Longer gaps may be found if -.I maxGapE -is positive, but it is not guaranteed to find all hits. When seeding is -enabled, BWA further requires that the first -.I seedLen -subsequence contains no more than -.I maxSeedDiff -differences. -.PP -When gapped alignment is disabled, BWA is expected to generate the same -alignment as Eland version 1, the Illumina alignment program. However, as BWA -change `N' in the database sequence to random nucleotides, hits to these -random sequences will also be counted. As a consequence, BWA may mark a -unique hit as a repeat, if the random sequences happen to be identical -to the sequences which should be unqiue in the database. -.PP -By default, if the best hit is not highly repetitive (controlled by -R), BWA -also finds all hits contains one more mismatch; otherwise, BWA finds all -equally best hits only. Base quality is NOT considered in evaluating -hits. 
In the paired-end mode, BWA pairs all hits it found. It further -performs Smith-Waterman alignment for unmapped reads to rescue reads with a -high erro rate, and for high-quality anomalous pairs to fix potential alignment -errors. - -.SS Estimating Insert Size Distribution -.PP -BWA estimates the insert size distribution per 256*1024 read pairs. It -first collects pairs of reads with both ends mapped with a single-end -quality 20 or higher and then calculates median (Q2), lower and higher -quartile (Q1 and Q3). It estimates the mean and the variance of the -insert size distribution from pairs whose insert sizes are within -interval [Q1-2(Q3-Q1), Q3+2(Q3-Q1)]. The maximum distance x for a pair -considered to be properly paired (SAM flag 0x2) is calculated by solving -equation Phi((x-mu)/sigma)=x/L*p0, where mu is the mean, sigma is the -standard error of the insert size distribution, L is the length of the -genome, p0 is prior of anomalous pair and Phi() is the standard -cumulative distribution function. For mapping Illumina short-insert -reads to the human genome, x is about 6-7 sigma away from the -mean. Quartiles, mean, variance and x will be printed to the standard -error output. - -.SS Memory Requirement -.PP -With bwtsw algorithm, 5GB memory is required for indexing the complete -human genome sequences. For short reads, the -.B aln -command uses ~3.2GB memory and the -.B sampe -command uses ~5.4GB. - -.SS Speed -.PP -Indexing the human genome sequences takes 3 hours with bwtsw -algorithm. Indexing smaller genomes with IS algorithms is -faster, but requires more memory. -.PP -The speed of alignment is largely determined by the error rate of the query -sequences (r). Firstly, BWA runs much faster for near perfect hits than -for hits with many differences, and it stops searching for a hit with -l+2 differences if a l-difference hit is found. 
This means BWA will be -very slow if r is high because in this case BWA has to visit hits with -many differences and looking for these hits is expensive. Secondly, the -alignment algorithm behind makes the speed sensitive to [k log(N)/m], -where k is the maximum allowed differences, N the size of database and m -the length of a query. In practice, we choose k w.r.t. r and therefore r -is the leading factor. I would not recommend to use BWA on data with -r>0.02. -.PP -Pairing is slower for shorter reads. This is mainly because shorter -reads have more spurious hits and converting SA coordinates to -chromosomal coordinates are very costly. - -.SH CHANGES IN BWA-0.6 -.PP -Since version 0.6, BWA has been able to work with a reference genome longer than 4GB. -This feature makes it possible to integrate the forward and reverse complemented -genome in one FM-index, which speeds up both BWA-short and BWA-SW. As a tradeoff, -BWA uses more memory because it has to keep all positions and ranks in 64-bit -integers, twice larger than 32-bit integers used in the previous versions. - -The latest BWA-SW also works for paired-end reads longer than 100bp. In -comparison to BWA-short, BWA-SW tends to be more accurate for highly unique -reads and more robust to relative long INDELs and structural variants. -Nonetheless, BWA-short usually has higher power to distinguish the optimal hit -from many suboptimal hits. The choice of the mapping algorithm may depend on -the application. - -.SH SEE ALSO -BWA website , Samtools website - - -.SH AUTHOR -Heng Li at the Sanger Institute wrote the key source codes and -integrated the following codes for BWT construction: bwtsw -, implemented by Chi-Kwong Wong at -the University of Hong Kong and IS - originally proposed by Nong Ge - at the Sun Yat-Sen University and -implemented by Yuta Mori. - -.SH LICENSE AND CITATION -.PP -The full BWA package is distributed under GPLv3 as it uses source codes -from BWT-SW which is covered by GPL. 
Sorting, hash table, BWT and IS -libraries are distributed under the MIT license. -.PP -If you use the BWA-backtrack algorithm, please cite the following -paper: -.PP -Li H. and Durbin R. (2009) Fast and accurate short read alignment with -Burrows-Wheeler transform. Bioinformatics, 25, 1754-1760. [PMID: 19451168] -.PP -If you use the BWA-SW algorithm, please cite: -.PP -Li H. and Durbin R. (2010) Fast and accurate long-read alignment with -Burrows-Wheeler transform. Bioinformatics, 26, 589-595. [PMID: 20080505] -.PP -If you use BWA-MEM or the fastmap component of BWA, please cite: -.PP -Li H. (2013) Aligning sequence reads, clone sequences and assembly contigs with -BWA-MEM. arXiv:1303.3997v1 [q-bio.GN]. -.PP -It is likely that the BWA-MEM manuscript will not appear in a peer-reviewed -journal. - -.SH HISTORY -BWA is largely influenced by BWT-SW. It uses source codes from BWT-SW -and mimics its binary file formats; BWA-SW resembles BWT-SW in several -ways. The initial idea about BWT-based alignment also came from the -group who developed BWT-SW. At the same time, BWA is different enough -from BWT-SW. The short-read alignment algorithm bears no similarity to -Smith-Waterman algorithm any more. While BWA-SW learns from BWT-SW, it -introduces heuristics that can hardly be applied to the original -algorithm. In all, BWA does not guarantee to find all local hits as what -BWT-SW is designed to do, but it is much faster than BWT-SW on both -short and long query sequences. - -I started to write the first piece of codes on 24 May 2008 and got the -initial stable version on 02 June 2008. During this period, I was -acquainted that Professor Tak-Wah Lam, the first author of BWT-SW paper, -was collaborating with Beijing Genomics Institute on SOAP2, the successor -to SOAP (Short Oligonucleotide Analysis Package). SOAP2 has come out in -November 2008. According to the SourceForge download page, the third -BWT-based short read aligner, bowtie, was first released in August -2008. 
At the time of writing this manual, at least three more BWT-based -short-read aligners are being implemented. - -The BWA-SW algorithm is a new component of BWA. It was conceived in -November 2008 and implemented ten months later. - -The BWA-MEM algorithm is based on an algorithm finding super-maximal exact -matches (SMEMs), which was first published with the fermi assembler paper -in 2012. I first implemented the basic SMEM algorithm in the -.B fastmap -command for an experiment and then extended the basic algorithm and added the -extension part in Feburary 2013 to make BWA-MEM a fully featured mapper. - diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/bwa.c --- a/bwa-0.7.9a/bwa.c Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,321 +0,0 @@ -#include -#include -#include -#include -#include "bntseq.h" -#include "bwa.h" -#include "ksw.h" -#include "utils.h" -#include "kstring.h" - -#ifdef USE_MALLOC_WRAPPERS -# include "malloc_wrap.h" -#endif - -int bwa_verbose = 3; -char bwa_rg_id[256]; - -/************************ - * Batch FASTA/Q reader * - ************************/ - -#include "kseq.h" -KSEQ_DECLARE(gzFile) - -static inline void trim_readno(kstring_t *s) -{ - if (s->l > 2 && s->s[s->l-2] == '/' && isdigit(s->s[s->l-1])) - s->l -= 2, s->s[s->l] = 0; -} - -static inline void kseq2bseq1(const kseq_t *ks, bseq1_t *s) -{ // TODO: it would be better to allocate one chunk of memory, but probably it does not matter in practice - s->name = strdup(ks->name.s); - s->comment = ks->comment.l? strdup(ks->comment.s) : 0; - s->seq = strdup(ks->seq.s); - s->qual = ks->qual.l? 
strdup(ks->qual.s) : 0; - s->l_seq = strlen(s->seq); -} - -bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_) -{ - kseq_t *ks = (kseq_t*)ks1_, *ks2 = (kseq_t*)ks2_; - int size = 0, m, n; - bseq1_t *seqs; - m = n = 0; seqs = 0; - while (kseq_read(ks) >= 0) { - if (ks2 && kseq_read(ks2) < 0) { // the 2nd file has fewer reads - fprintf(stderr, "[W::%s] the 2nd file has fewer sequences.\n", __func__); - break; - } - if (n >= m) { - m = m? m<<1 : 256; - seqs = realloc(seqs, m * sizeof(bseq1_t)); - } - trim_readno(&ks->name); - kseq2bseq1(ks, &seqs[n]); - size += seqs[n++].l_seq; - if (ks2) { - trim_readno(&ks2->name); - kseq2bseq1(ks2, &seqs[n]); - size += seqs[n++].l_seq; - } - if (size >= chunk_size && (n&1) == 0) break; - } - if (size == 0) { // test if the 2nd file is finished - if (ks2 && kseq_read(ks2) >= 0) - fprintf(stderr, "[W::%s] the 1st file has fewer sequences.\n", __func__); - } - *n_ = n; - return seqs; -} - -/***************** - * CIGAR related * - *****************/ - -void bwa_fill_scmat(int a, int b, int8_t mat[25]) -{ - int i, j, k; - for (i = k = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - mat[k++] = i == j? 
a : -b; - mat[k++] = -1; // ambiguous base - } - for (j = 0; j < 5; ++j) mat[k++] = -1; -} - -// Generate CIGAR when the alignment end points are known -uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM) -{ - uint32_t *cigar = 0; - uint8_t tmp, *rseq; - int i; - int64_t rlen; - kstring_t str; - const char *int2base; - - if (n_cigar) *n_cigar = 0; - if (NM) *NM = -1; - if (l_query <= 0 || rb >= re || (rb < l_pac && re > l_pac)) return 0; // reject if negative length or bridging the forward and reverse strand - rseq = bns_get_seq(l_pac, pac, rb, re, &rlen); - if (re - rb != rlen) goto ret_gen_cigar; // possible if out of range - if (rb >= l_pac) { // then reverse both query and rseq; this is to ensure indels to be placed at the leftmost position - for (i = 0; i < l_query>>1; ++i) - tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp; - for (i = 0; i < rlen>>1; ++i) - tmp = rseq[i], rseq[i] = rseq[rlen - 1 - i], rseq[rlen - 1 - i] = tmp; - } - if (l_query == re - rb && w_ == 0) { // no gap; no need to do DP - // UPDATE: we come to this block now... FIXME: due to an issue in mem_reg2aln(), we never come to this block. This does not affect accuracy, but it hurts performance. - if (n_cigar) { - cigar = malloc(4); - cigar[0] = l_query<<4 | 0; - *n_cigar = 1; - } - for (i = 0, *score = 0; i < l_query; ++i) - *score += mat[rseq[i]*5 + query[i]]; - } else { - int w, max_gap, max_ins, max_del, min_w; - // set the band-width - max_ins = (int)((double)(((l_query+1)>>1) * mat[0] - o_ins) / e_ins + 1.); - max_del = (int)((double)(((l_query+1)>>1) * mat[0] - o_del) / e_del + 1.); - max_gap = max_ins > max_del? max_ins : max_del; - max_gap = max_gap > 1? max_gap : 1; - w = (max_gap + abs(rlen - l_query) + 1) >> 1; - w = w < w_? w : w_; - min_w = abs(rlen - l_query) + 3; - w = w > min_w? 
w : min_w; - // NW alignment - if (bwa_verbose >= 4) { - printf("* Global bandwidth: %d\n", w); - printf("* Global ref: "); for (i = 0; i < rlen; ++i) putchar("ACGTN"[(int)rseq[i]]); putchar('\n'); - printf("* Global query: "); for (i = 0; i < l_query; ++i) putchar("ACGTN"[(int)query[i]]); putchar('\n'); - } - *score = ksw_global2(l_query, query, rlen, rseq, 5, mat, o_del, e_del, o_ins, e_ins, w, n_cigar, &cigar); - } - if (NM && n_cigar) {// compute NM and MD - int k, x, y, u, n_mm = 0, n_gap = 0; - str.l = str.m = *n_cigar * 4; str.s = (char*)cigar; // append MD to CIGAR - int2base = rb < l_pac? "ACGTN" : "TGCAN"; - for (k = 0, x = y = u = 0; k < *n_cigar; ++k) { - int op, len; - cigar = (uint32_t*)str.s; - op = cigar[k]&0xf, len = cigar[k]>>4; - if (op == 0) { // match - for (i = 0; i < len; ++i) { - if (query[x + i] != rseq[y + i]) { - kputw(u, &str); - kputc(int2base[rseq[y+i]], &str); - ++n_mm; u = 0; - } else ++u; - } - x += len; y += len; - } else if (op == 2) { // deletion - if (k > 0 && k < *n_cigar - 1) { // don't do the following if D is the first or the last CIGAR - kputw(u, &str); kputc('^', &str); - for (i = 0; i < len; ++i) - kputc(int2base[rseq[y+i]], &str); - u = 0; n_gap += len; - } - y += len; - } else if (op == 1) x += len, n_gap += len; // insertion - } - kputw(u, &str); kputc(0, &str); - *NM = n_mm + n_gap; - cigar = (uint32_t*)str.s; - } - if (rb >= l_pac) // reverse back query - for (i = 0; i < l_query>>1; ++i) - tmp = query[i], query[i] = query[l_query - 1 - i], query[l_query - 1 - i] = tmp; - -ret_gen_cigar: - free(rseq); - return cigar; -} - -uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM) -{ - return bwa_gen_cigar2(mat, q, r, q, r, w_, l_pac, pac, l_query, query, rb, re, score, n_cigar, NM); -} - -/********************* - * Full index reader * - *********************/ - -char 
*bwa_idx_infer_prefix(const char *hint) -{ - char *prefix; - int l_hint; - FILE *fp; - l_hint = strlen(hint); - prefix = malloc(l_hint + 3 + 4 + 1); - strcpy(prefix, hint); - strcpy(prefix + l_hint, ".64.bwt"); - if ((fp = fopen(prefix, "rb")) != 0) { - fclose(fp); - prefix[l_hint + 3] = 0; - return prefix; - } else { - strcpy(prefix + l_hint, ".bwt"); - if ((fp = fopen(prefix, "rb")) == 0) { - free(prefix); - return 0; - } else { - fclose(fp); - prefix[l_hint] = 0; - return prefix; - } - } -} - -bwt_t *bwa_idx_load_bwt(const char *hint) -{ - char *tmp, *prefix; - bwt_t *bwt; - prefix = bwa_idx_infer_prefix(hint); - if (prefix == 0) { - if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__); - return 0; - } - tmp = calloc(strlen(prefix) + 5, 1); - strcat(strcpy(tmp, prefix), ".bwt"); // FM-index - bwt = bwt_restore_bwt(tmp); - strcat(strcpy(tmp, prefix), ".sa"); // partial suffix array (SA) - bwt_restore_sa(tmp, bwt); - free(tmp); free(prefix); - return bwt; -} - -bwaidx_t *bwa_idx_load(const char *hint, int which) -{ - bwaidx_t *idx; - char *prefix; - prefix = bwa_idx_infer_prefix(hint); - if (prefix == 0) { - if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to locate the index files\n", __func__); - return 0; - } - idx = calloc(1, sizeof(bwaidx_t)); - if (which & BWA_IDX_BWT) idx->bwt = bwa_idx_load_bwt(hint); - if (which & BWA_IDX_BNS) { - idx->bns = bns_restore(prefix); - if (which & BWA_IDX_PAC) { - idx->pac = calloc(idx->bns->l_pac/4+1, 1); - err_fread_noeof(idx->pac, 1, idx->bns->l_pac/4+1, idx->bns->fp_pac); // concatenated 2-bit encoded sequence - err_fclose(idx->bns->fp_pac); - idx->bns->fp_pac = 0; - } - } - free(prefix); - return idx; -} - -void bwa_idx_destroy(bwaidx_t *idx) -{ - if (idx == 0) return; - if (idx->bwt) bwt_destroy(idx->bwt); - if (idx->bns) bns_destroy(idx->bns); - if (idx->pac) free(idx->pac); - free(idx); -} - -/*********************** - * SAM header routines * - ***********************/ - -void 
bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line) -{ - int i; - extern char *bwa_pg; - for (i = 0; i < bns->n_seqs; ++i) - err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[i].name, bns->anns[i].len); - if (rg_line) err_printf("%s\n", rg_line); - err_printf("%s\n", bwa_pg); -} - -static char *bwa_escape(char *s) -{ - char *p, *q; - for (p = q = s; *p; ++p) { - if (*p == '\\') { - ++p; - if (*p == 't') *q++ = '\t'; - else if (*p == 'n') *q++ = '\n'; - else if (*p == 'r') *q++ = '\r'; - else if (*p == '\\') *q++ = '\\'; - } else *q++ = *p; - } - *q = '\0'; - return s; -} - -char *bwa_set_rg(const char *s) -{ - char *p, *q, *r, *rg_line = 0; - memset(bwa_rg_id, 0, 256); - if (strstr(s, "@RG") != s) { - if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] the read group line is not started with @RG\n", __func__); - goto err_set_rg; - } - rg_line = strdup(s); - bwa_escape(rg_line); - if ((p = strstr(rg_line, "\tID:")) == 0) { - if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] no ID at the read group line\n", __func__); - goto err_set_rg; - } - p += 4; - for (q = p; *q && *q != '\t' && *q != '\n'; ++q); - if (q - p + 1 > 256) { - if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] @RG:ID is longer than 255 characters\n", __func__); - goto err_set_rg; - } - for (q = p, r = bwa_rg_id; *q && *q != '\t' && *q != '\n'; ++q) - *r++ = *q; - return rg_line; - -err_set_rg: - free(rg_line); - return 0; -} - diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/bwa.h --- a/bwa-0.7.9a/bwa.h Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,50 +0,0 @@ -#ifndef BWA_H_ -#define BWA_H_ - -#include -#include "bntseq.h" -#include "bwt.h" - -#define BWA_IDX_BWT 0x1 -#define BWA_IDX_BNS 0x2 -#define BWA_IDX_PAC 0x4 -#define BWA_IDX_ALL 0x7 - -typedef struct { - bwt_t *bwt; // FM-index - bntseq_t *bns; // information on the reference sequences - uint8_t *pac; // the actual 2-bit encoded reference sequences with 'N' converted to a random base -} bwaidx_t; - -typedef struct { 
- int l_seq; - char *name, *comment, *seq, *qual, *sam; -} bseq1_t; - -extern int bwa_verbose; -extern char bwa_rg_id[256]; - -#ifdef __cplusplus -extern "C" { -#endif - - bseq1_t *bseq_read(int chunk_size, int *n_, void *ks1_, void *ks2_); - - void bwa_fill_scmat(int a, int b, int8_t mat[25]); - uint32_t *bwa_gen_cigar(const int8_t mat[25], int q, int r, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM); - uint32_t *bwa_gen_cigar2(const int8_t mat[25], int o_del, int e_del, int o_ins, int e_ins, int w_, int64_t l_pac, const uint8_t *pac, int l_query, uint8_t *query, int64_t rb, int64_t re, int *score, int *n_cigar, int *NM); - - char *bwa_idx_infer_prefix(const char *hint); - bwt_t *bwa_idx_load_bwt(const char *hint); - - bwaidx_t *bwa_idx_load(const char *hint, int which); - void bwa_idx_destroy(bwaidx_t *idx); - - void bwa_print_sam_hdr(const bntseq_t *bns, const char *rg_line); - char *bwa_set_rg(const char *s); - -#ifdef __cplusplus -} -#endif - -#endif diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/bwape.c --- a/bwa-0.7.9a/bwape.c Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,783 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include "bwtaln.h" -#include "kvec.h" -#include "bntseq.h" -#include "utils.h" -#include "bwase.h" -#include "bwa.h" -#include "ksw.h" - -#ifdef USE_MALLOC_WRAPPERS -# include "malloc_wrap.h" -#endif - -typedef struct { - int n; - bwtint_t *a; -} poslist_t; - -typedef struct { - double avg, std, ap_prior; - bwtint_t low, high, high_bayesian; -} isize_info_t; - -#define b128_eq(a, b) ((a).x == (b).x && (a).y == (b).y) -#define b128_hash(a) ((uint32_t)(a).x) - -#include "khash.h" -KHASH_INIT(b128, pair64_t, poslist_t, 1, b128_hash, b128_eq) - -typedef struct { - pair64_v arr; - pair64_v pos[2]; - kvec_t(bwt_aln1_t) aln[2]; -} pe_data_t; - -#define MIN_HASH_WIDTH 1000 - -extern int g_log_n[256]; // in 
bwase.c -static kh_b128_t *g_hash; - -void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi); -void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s); -int bwa_approx_mapQ(const bwa_seq_t *p, int mm); -void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2); -bntseq_t *bwa_open_nt(const char *prefix); -void bwa_print_sam_SQ(const bntseq_t *bns); - -pe_opt_t *bwa_init_pe_opt() -{ - pe_opt_t *po; - po = (pe_opt_t*)calloc(1, sizeof(pe_opt_t)); - po->max_isize = 500; - po->force_isize = 0; - po->max_occ = 100000; - po->n_multi = 3; - po->N_multi = 10; - po->type = BWA_PET_STD; - po->is_sw = 1; - po->ap_prior = 1e-5; - return po; -} -/* -static double ierfc(double x) // inverse erfc(); iphi(x) = M_SQRT2 *ierfc(2 * x); -{ - const double a = 0.140012; - double b, c; - b = log(x * (2 - x)); - c = 2./M_PI/a + b / 2.; - return sqrt(sqrt(c * c - b / a) - c); -} -*/ - -// for normal distribution, this is about 3std -#define OUTLIER_BOUND 2.0 - -static int infer_isize(int n_seqs, bwa_seq_t *seqs[2], isize_info_t *ii, double ap_prior, int64_t L) -{ - uint64_t x, *isizes, n_ap = 0; - int n, i, tot, p25, p75, p50, max_len = 1, tmp; - double skewness = 0.0, kurtosis = 0.0, y; - - ii->avg = ii->std = -1.0; - ii->low = ii->high = ii->high_bayesian = 0; - isizes = (uint64_t*)calloc(n_seqs, 8); - for (i = 0, tot = 0; i != n_seqs; ++i) { - bwa_seq_t *p[2]; - p[0] = seqs[0] + i; p[1] = seqs[1] + i; - if (p[0]->mapQ >= 20 && p[1]->mapQ >= 20) { - x = (p[0]->pos < p[1]->pos)? 
p[1]->pos + p[1]->len - p[0]->pos : p[0]->pos + p[0]->len - p[1]->pos; - if (x < 100000) isizes[tot++] = x; - } - if (p[0]->len > max_len) max_len = p[0]->len; - if (p[1]->len > max_len) max_len = p[1]->len; - } - if (tot < 20) { - fprintf(stderr, "[infer_isize] fail to infer insert size: too few good pairs\n"); - free(isizes); - return -1; - } - ks_introsort_64(tot, isizes); - p25 = isizes[(int)(tot*0.25 + 0.5)]; - p50 = isizes[(int)(tot*0.50 + 0.5)]; - p75 = isizes[(int)(tot*0.75 + 0.5)]; - tmp = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499); - ii->low = tmp > max_len? tmp : max_len; // ii->low is unsigned - ii->high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499); - if (ii->low > ii->high) { - fprintf(stderr, "[infer_isize] fail to infer insert size: upper bound is smaller than read length\n"); - free(isizes); - return -1; - } - for (i = 0, x = n = 0; i < tot; ++i) - if (isizes[i] >= ii->low && isizes[i] <= ii->high) - ++n, x += isizes[i]; - ii->avg = (double)x / n; - for (i = 0; i < tot; ++i) { - if (isizes[i] >= ii->low && isizes[i] <= ii->high) { - double tmp = (isizes[i] - ii->avg) * (isizes[i] - ii->avg); - ii->std += tmp; - skewness += tmp * (isizes[i] - ii->avg); - kurtosis += tmp * tmp; - } - } - kurtosis = kurtosis/n / (ii->std / n * ii->std / n) - 3; - ii->std = sqrt(ii->std / n); // it would be better as n-1, but n is usually very large - skewness = skewness / n / (ii->std * ii->std * ii->std); - for (y = 1.0; y < 10.0; y += 0.01) - if (.5 * erfc(y / M_SQRT2) < ap_prior / L * (y * ii->std + ii->avg)) break; - ii->high_bayesian = (bwtint_t)(y * ii->std + ii->avg + .499); - for (i = 0; i < tot; ++i) - if (isizes[i] > ii->high_bayesian) ++n_ap; - ii->ap_prior = .01 * (n_ap + .01) / tot; - if (ii->ap_prior < ap_prior) ii->ap_prior = ap_prior; - free(isizes); - fprintf(stderr, "[infer_isize] (25, 50, 75) percentile: (%d, %d, %d)\n", p25, p50, p75); - if (isnan(ii->std) || p75 > 100000) { - ii->low = ii->high = ii->high_bayesian = 0; ii->avg = ii->std = 
-1.0; - fprintf(stderr, "[infer_isize] fail to infer insert size: weird pairing\n"); - return -1; - } - for (y = 1.0; y < 10.0; y += 0.01) - if (.5 * erfc(y / M_SQRT2) < ap_prior / L * (y * ii->std + ii->avg)) break; - ii->high_bayesian = (bwtint_t)(y * ii->std + ii->avg + .499); - fprintf(stderr, "[infer_isize] low and high boundaries: %ld and %ld for estimating avg and std\n", (long)ii->low, (long)ii->high); - fprintf(stderr, "[infer_isize] inferred external isize from %d pairs: %.3lf +/- %.3lf\n", n, ii->avg, ii->std); - fprintf(stderr, "[infer_isize] skewness: %.3lf; kurtosis: %.3lf; ap_prior: %.2e\n", skewness, kurtosis, ii->ap_prior); - fprintf(stderr, "[infer_isize] inferred maximum insert size: %ld (%.2lf sigma)\n", (long)ii->high_bayesian, y); - return 0; -} - -static int pairing(bwa_seq_t *p[2], pe_data_t *d, const pe_opt_t *opt, int s_mm, const isize_info_t *ii) -{ - int i, j, o_n, subo_n, cnt_chg = 0, low_bound = ii->low, max_len; - uint64_t o_score, subo_score; - pair64_t last_pos[2][2], o_pos[2]; - max_len = p[0]->full_len; - if (max_len < p[1]->full_len) max_len = p[1]->full_len; - if (low_bound < max_len) low_bound = max_len; - - // here v>=u. 
When ii is set, we check insert size with ii; otherwise with opt->max_isize -#define __pairing_aux(u,v) do { \ - bwtint_t l = (v).x + p[(v).y&1]->len - ((u).x); \ - if ((u).x != (uint64_t)-1 && (v).x > (u).x && l >= max_len \ - && ((ii->high && l <= ii->high_bayesian) || (ii->high == 0 && l <= opt->max_isize))) \ - { \ - uint64_t s = d->aln[(v).y&1].a[(v).y>>2].score + d->aln[(u).y&1].a[(u).y>>2].score; \ - s *= 10; \ - if (ii->high) s += (int)(-4.343 * log(.5 * erfc(M_SQRT1_2 * fabs(l - ii->avg) / ii->std)) + .499); \ - s = s<<32 | (uint32_t)hash_64((u).x<<32 | (v).x); \ - if (s>>32 == o_score>>32) ++o_n; \ - else if (s>>32 < o_score>>32) { subo_n += o_n; o_n = 1; } \ - else ++subo_n; \ - if (s < o_score) subo_score = o_score, o_score = s, o_pos[(u).y&1] = (u), o_pos[(v).y&1] = (v); \ - else if (s < subo_score) subo_score = s; \ - } \ - } while (0) - -#define __pairing_aux2(q, w) do { \ - const bwt_aln1_t *r = d->aln[(w).y&1].a + ((w).y>>2); \ - (q)->extra_flag |= SAM_FPP; \ - if ((q)->pos != (w).x || (q)->strand != ((w).y>>1&1)) { \ - (q)->n_mm = r->n_mm; (q)->n_gapo = r->n_gapo; (q)->n_gape = r->n_gape; (q)->strand = (w).y>>1&1; \ - (q)->score = r->score; \ - (q)->pos = (w).x; \ - if ((q)->mapQ > 0) ++cnt_chg; \ - } \ - } while (0) - - o_score = subo_score = (uint64_t)-1; - o_n = subo_n = 0; - ks_introsort_128(d->arr.n, d->arr.a); - for (j = 0; j < 2; ++j) last_pos[j][0].x = last_pos[j][0].y = last_pos[j][1].x = last_pos[j][1].y = (uint64_t)-1; - if (opt->type == BWA_PET_STD) { - for (i = 0; i < d->arr.n; ++i) { - pair64_t x = d->arr.a[i]; - int strand = x.y>>1&1; - if (strand == 1) { // reverse strand, then check - int y = 1 - (x.y&1); - __pairing_aux(last_pos[y][1], x); - __pairing_aux(last_pos[y][0], x); - } else { // forward strand, then push - last_pos[x.y&1][0] = last_pos[x.y&1][1]; - last_pos[x.y&1][1] = x; - } - } - } else { - fprintf(stderr, "[paring] not implemented yet!\n"); - exit(1); - } - // set pairing - //fprintf(stderr, "[%ld, %d, %d, %d]\n", 
d->arr.n, (int)(o_score>>32), (int)(subo_score>>32), o_n); - if (o_score != (uint64_t)-1) { - int mapQ_p = 0; // this is the maximum mapping quality when one end is moved - //fprintf(stderr, "%d, %d\n", o_n, subo_n); - if (o_n == 1) { - if (subo_score == (uint64_t)-1) mapQ_p = 29; // no sub-optimal pair - else if ((subo_score>>32) - (o_score>>32) > s_mm * 10) mapQ_p = 23; // poor sub-optimal pair - else { - int n = subo_n > 255? 255 : subo_n; - mapQ_p = ((subo_score>>32) - (o_score>>32)) / 2 - g_log_n[n]; - if (mapQ_p < 0) mapQ_p = 0; - } - } - if ((p[0]->pos == o_pos[0].x && p[0]->strand == (o_pos[0].y>>1&1)) && (p[1]->pos == o_pos[1].x && p[1]->strand == (o_pos[1].y>>1&1))) { // both ends not moved - if (p[0]->mapQ > 0 && p[1]->mapQ > 0) { - int mapQ = p[0]->mapQ + p[1]->mapQ; - if (mapQ > 60) mapQ = 60; - p[0]->mapQ = p[1]->mapQ = mapQ; - } else { - if (p[0]->mapQ == 0) p[0]->mapQ = (mapQ_p + 7 < p[1]->mapQ)? mapQ_p + 7 : p[1]->mapQ; - if (p[1]->mapQ == 0) p[1]->mapQ = (mapQ_p + 7 < p[0]->mapQ)? 
mapQ_p + 7 : p[0]->mapQ; - } - } else if (p[0]->pos == o_pos[0].x && p[0]->strand == (o_pos[0].y>>1&1)) { // [1] moved - p[1]->seQ = 0; p[1]->mapQ = p[0]->mapQ; - if (p[1]->mapQ > mapQ_p) p[1]->mapQ = mapQ_p; - } else if (p[1]->pos == o_pos[1].x && p[1]->strand == (o_pos[1].y>>1&1)) { // [0] moved - p[0]->seQ = 0; p[0]->mapQ = p[1]->mapQ; - if (p[0]->mapQ > mapQ_p) p[0]->mapQ = mapQ_p; - } else { // both ends moved - p[0]->seQ = p[1]->seQ = 0; - mapQ_p -= 20; - if (mapQ_p < 0) mapQ_p = 0; - p[0]->mapQ = p[1]->mapQ = mapQ_p; - } - __pairing_aux2(p[0], o_pos[0]); - __pairing_aux2(p[1], o_pos[1]); - } - return cnt_chg; -} - -typedef struct { - kvec_t(bwt_aln1_t) aln; -} aln_buf_t; - -int bwa_cal_pac_pos_pe(const bntseq_t *bns, const char *prefix, bwt_t *const _bwt, int n_seqs, bwa_seq_t *seqs[2], FILE *fp_sa[2], isize_info_t *ii, - const pe_opt_t *opt, const gap_opt_t *gopt, const isize_info_t *last_ii) -{ - int i, j, cnt_chg = 0; - char str[1024]; - bwt_t *bwt; - pe_data_t *d; - aln_buf_t *buf[2]; - - d = (pe_data_t*)calloc(1, sizeof(pe_data_t)); - buf[0] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t)); - buf[1] = (aln_buf_t*)calloc(n_seqs, sizeof(aln_buf_t)); - - if (_bwt == 0) { // load forward SA - strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); - strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt); - } else bwt = _bwt; - - // SE - for (i = 0; i != n_seqs; ++i) { - bwa_seq_t *p[2]; - for (j = 0; j < 2; ++j) { - int n_aln; - p[j] = seqs[j] + i; - p[j]->n_multi = 0; - p[j]->extra_flag |= SAM_FPD | (j == 0? 
SAM_FR1 : SAM_FR2); - err_fread_noeof(&n_aln, 4, 1, fp_sa[j]); - if (n_aln > kv_max(d->aln[j])) - kv_resize(bwt_aln1_t, d->aln[j], n_aln); - d->aln[j].n = n_aln; - err_fread_noeof(d->aln[j].a, sizeof(bwt_aln1_t), n_aln, fp_sa[j]); - kv_copy(bwt_aln1_t, buf[j][i].aln, d->aln[j]); // backup d->aln[j] - // generate SE alignment and mapping quality - bwa_aln2seq(n_aln, d->aln[j].a, p[j]); - if (p[j]->type == BWA_TYPE_UNIQUE || p[j]->type == BWA_TYPE_REPEAT) { - int strand; - int max_diff = gopt->fnr > 0.0? bwa_cal_maxdiff(p[j]->len, BWA_AVG_ERR, gopt->fnr) : gopt->max_diff; - p[j]->seQ = p[j]->mapQ = bwa_approx_mapQ(p[j], max_diff); - p[j]->pos = bwa_sa2pos(bns, bwt, p[j]->sa, p[j]->len + p[j]->ref_shift, &strand); - p[j]->strand = strand; - if (p[j]->pos == (bwtint_t)-1) p[j]->type = BWA_TYPE_NO_MATCH; - } - } - } - - // infer isize - infer_isize(n_seqs, seqs, ii, opt->ap_prior, bwt->seq_len/2); - if (ii->avg < 0.0 && last_ii->avg > 0.0) *ii = *last_ii; - if (opt->force_isize) { - fprintf(stderr, "[%s] discard insert size estimate as user's request.\n", __func__); - ii->low = ii->high = 0; ii->avg = ii->std = -1.0; - } - - // PE - for (i = 0; i != n_seqs; ++i) { - bwa_seq_t *p[2]; - for (j = 0; j < 2; ++j) { - p[j] = seqs[j] + i; - kv_copy(bwt_aln1_t, d->aln[j], buf[j][i].aln); - } - if ((p[0]->type == BWA_TYPE_UNIQUE || p[0]->type == BWA_TYPE_REPEAT) - && (p[1]->type == BWA_TYPE_UNIQUE || p[1]->type == BWA_TYPE_REPEAT)) - { // only when both ends mapped - pair64_t x; - int j, k; - long long n_occ[2]; - for (j = 0; j < 2; ++j) { - n_occ[j] = 0; - for (k = 0; k < d->aln[j].n; ++k) - n_occ[j] += d->aln[j].a[k].l - d->aln[j].a[k].k + 1; - } - if (n_occ[0] > opt->max_occ || n_occ[1] > opt->max_occ) continue; - d->arr.n = 0; - for (j = 0; j < 2; ++j) { - for (k = 0; k < d->aln[j].n; ++k) { - bwt_aln1_t *r = d->aln[j].a + k; - bwtint_t l; - if (0 && r->l - r->k + 1 >= MIN_HASH_WIDTH) { // then check hash table - pair64_t key; - int ret; - key.x = r->k; key.y = r->l; - 
khint_t iter = kh_put(b128, g_hash, key, &ret); - if (ret) { // not in the hash table; ret must equal 1 as we never remove elements - poslist_t *z = &kh_val(g_hash, iter); - z->n = r->l - r->k + 1; - z->a = (bwtint_t*)malloc(sizeof(bwtint_t) * z->n); - for (l = r->k; l <= r->l; ++l) { - int strand; - z->a[l - r->k] = bwa_sa2pos(bns, bwt, l, p[j]->len + p[j]->ref_shift, &strand)<<1; - z->a[l - r->k] |= strand; - } - } - for (l = 0; l < kh_val(g_hash, iter).n; ++l) { - x.x = kh_val(g_hash, iter).a[l]>>1; - x.y = k<<2 | (kh_val(g_hash, iter).a[l]&1)<<1 | j; - kv_push(pair64_t, d->arr, x); - } - } else { // then calculate on the fly - for (l = r->k; l <= r->l; ++l) { - int strand; - x.x = bwa_sa2pos(bns, bwt, l, p[j]->len + p[j]->ref_shift, &strand); - x.y = k<<2 | strand<<1 | j; - kv_push(pair64_t, d->arr, x); - } - } - } - } - cnt_chg += pairing(p, d, opt, gopt->s_mm, ii); - } - - if (opt->N_multi || opt->n_multi) { - for (j = 0; j < 2; ++j) { - if (p[j]->type != BWA_TYPE_NO_MATCH) { - int k, n_multi; - if (!(p[j]->extra_flag&SAM_FPP) && p[1-j]->type != BWA_TYPE_NO_MATCH) { - bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, p[j]->c1+p[j]->c2-1 > opt->N_multi? 
opt->n_multi : opt->N_multi); - } else bwa_aln2seq_core(d->aln[j].n, d->aln[j].a, p[j], 0, opt->n_multi); - for (k = 0, n_multi = 0; k < p[j]->n_multi; ++k) { - int strand; - bwt_multi1_t *q = p[j]->multi + k; - q->pos = bwa_sa2pos(bns, bwt, q->pos, p[j]->len + q->ref_shift, &strand); - q->strand = strand; - if (q->pos != p[j]->pos) - p[j]->multi[n_multi++] = *q; - } - p[j]->n_multi = n_multi; - } - } - } - } - - // free - for (i = 0; i < n_seqs; ++i) { - kv_destroy(buf[0][i].aln); - kv_destroy(buf[1][i].aln); - } - free(buf[0]); free(buf[1]); - if (_bwt == 0) bwt_destroy(bwt); - kv_destroy(d->arr); - kv_destroy(d->pos[0]); kv_destroy(d->pos[1]); - kv_destroy(d->aln[0]); kv_destroy(d->aln[1]); - free(d); - return cnt_chg; -} - -#define SW_MIN_MATCH_LEN 20 -#define SW_MIN_MAPQ 17 - -// cnt = n_mm<<16 | n_gapo<<8 | n_gape -bwa_cigar_t *bwa_sw_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, const ubyte_t *seq, int64_t *beg, int reglen, int *n_cigar, uint32_t *_cnt) -{ - kswr_t r; - uint32_t *cigar32 = 0; - bwa_cigar_t *cigar = 0; - ubyte_t *ref_seq; - bwtint_t k, x, y, l; - int xtra, gscore; - int8_t mat[25]; - - bwa_fill_scmat(1, 3, mat); - // check whether there are too many N's - if (reglen < SW_MIN_MATCH_LEN || (int64_t)l_pac - *beg < len) return 0; - for (k = 0, x = 0; k < len; ++k) - if (seq[k] >= 4) ++x; - if ((float)x/len >= 0.25 || len - x < SW_MIN_MATCH_LEN) return 0; - - // get reference subsequence - ref_seq = (ubyte_t*)calloc(reglen, 1); - for (k = *beg, l = 0; l < reglen && k < l_pac; ++k) - ref_seq[l++] = pacseq[k>>2] >> ((~k&3)<<1) & 3; - - // do alignment - xtra = KSW_XSUBO | KSW_XSTART | (len < 250? 
KSW_XBYTE : 0); - r = ksw_align(len, (uint8_t*)seq, l, ref_seq, 5, mat, 5, 1, xtra, 0); - gscore = ksw_global(r.qe - r.qb + 1, &seq[r.qb], r.te - r.tb + 1, &ref_seq[r.tb], 5, mat, 5, 1, 50, n_cigar, &cigar32); - cigar = (bwa_cigar_t*)cigar32; - for (k = 0; k < *n_cigar; ++k) - cigar[k] = __cigar_create((cigar32[k]&0xf), (cigar32[k]>>4)); - - if (r.score < SW_MIN_MATCH_LEN || r.score2 == r.score || gscore != r.score) { // poor hit or tandem hits or weird alignment - free(cigar); free(ref_seq); *n_cigar = 0; - return 0; - } - - // check whether the alignment is good enough - for (k = 0, x = y = 0; k < *n_cigar; ++k) { - bwa_cigar_t c = cigar[k]; - if (__cigar_op(c) == FROM_M) x += __cigar_len(c), y += __cigar_len(c); - else if (__cigar_op(c) == FROM_D) x += __cigar_len(c); - else y += __cigar_len(c); - } - if (x < SW_MIN_MATCH_LEN || y < SW_MIN_MATCH_LEN) { // not good enough - free(cigar); free(ref_seq); - *n_cigar = 0; - return 0; - } - - { // update cigar and coordinate; - int start = r.qb, end = r.qe + 1; - *beg += r.tb; - cigar = (bwa_cigar_t*)realloc(cigar, sizeof(bwa_cigar_t) * (*n_cigar + 2)); - if (start) { - memmove(cigar + 1, cigar, sizeof(bwa_cigar_t) * (*n_cigar)); - cigar[0] = __cigar_create(3, start); - ++(*n_cigar); - } - if (end < len) { - /*cigar[*n_cigar] = 3<<14 | (len - end);*/ - cigar[*n_cigar] = __cigar_create(3, (len - end)); - ++(*n_cigar); - } - } - - { // set *cnt - int n_mm, n_gapo, n_gape; - n_mm = n_gapo = n_gape = 0; - x = r.tb; y = r.qb; - for (k = 0; k < *n_cigar; ++k) { - bwa_cigar_t c = cigar[k]; - if (__cigar_op(c) == FROM_M) { - for (l = 0; l < (__cigar_len(c)); ++l) - if (ref_seq[x+l] < 4 && seq[y+l] < 4 && ref_seq[x+l] != seq[y+l]) ++n_mm; - x += __cigar_len(c), y += __cigar_len(c); - } else if (__cigar_op(c) == FROM_D) { - x += __cigar_len(c), ++n_gapo, n_gape += (__cigar_len(c)) - 1; - } else if (__cigar_op(c) == FROM_I) { - y += __cigar_len(c), ++n_gapo, n_gape += (__cigar_len(c)) - 1; - } - } - *_cnt = (uint32_t)n_mm<<16 | 
n_gapo<<8 | n_gape; - } - - free(ref_seq); - return cigar; -} - -ubyte_t *bwa_paired_sw(const bntseq_t *bns, const ubyte_t *_pacseq, int n_seqs, bwa_seq_t *seqs[2], const pe_opt_t *popt, const isize_info_t *ii) -{ - ubyte_t *pacseq; - int i; - uint64_t n_tot[2], n_mapped[2]; - - // load reference sequence - if (_pacseq == 0) { - pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1); - err_rewind(bns->fp_pac); - err_fread_noeof(pacseq, 1, bns->l_pac/4+1, bns->fp_pac); - } else pacseq = (ubyte_t*)_pacseq; - if (!popt->is_sw || ii->avg < 0.0) return pacseq; - - // perform mate alignment - n_tot[0] = n_tot[1] = n_mapped[0] = n_mapped[1] = 0; - for (i = 0; i != n_seqs; ++i) { - bwa_seq_t *p[2]; - p[0] = seqs[0] + i; p[1] = seqs[1] + i; - if ((p[0]->mapQ >= SW_MIN_MAPQ || p[1]->mapQ >= SW_MIN_MAPQ) && (p[0]->extra_flag&SAM_FPP) == 0) { // unpaired and one read has high mapQ - int k, n_cigar[2], is_singleton, mapQ = 0, mq_adjust[2]; - int64_t beg[2], end[2]; - bwa_cigar_t *cigar[2]; - uint32_t cnt[2]; - - /* In the following, _pref points to the reference read - * which must be aligned; _pmate points to its mate which is - * considered to be modified. */ - -#define __set_rght_coor(_a, _b, _pref, _pmate) do { \ - (_a) = (int64_t)_pref->pos + ii->avg - 3 * ii->std - _pmate->len * 1.5; \ - (_b) = (_a) + 6 * ii->std + 2 * _pmate->len; \ - if ((_a) < (int64_t)_pref->pos + _pref->len) (_a) = _pref->pos + _pref->len; \ - if ((_b) > bns->l_pac) (_b) = bns->l_pac; \ - } while (0) - -#define __set_left_coor(_a, _b, _pref, _pmate) do { \ - (_a) = (int64_t)_pref->pos + _pref->len - ii->avg - 3 * ii->std - _pmate->len * 0.5; \ - (_b) = (_a) + 6 * ii->std + 2 * _pmate->len; \ - if ((_a) < 0) (_a) = 0; \ - if ((_b) > _pref->pos) (_b) = _pref->pos; \ - } while (0) - -#define __set_fixed(_pref, _pmate, _beg, _cnt) do { \ - _pmate->type = BWA_TYPE_MATESW; \ - _pmate->pos = _beg; \ - _pmate->seQ = _pref->seQ; \ - _pmate->strand = (popt->type == BWA_PET_STD)? 
1 - _pref->strand : _pref->strand; \ - _pmate->n_mm = _cnt>>16; _pmate->n_gapo = _cnt>>8&0xff; _pmate->n_gape = _cnt&0xff; \ - _pmate->extra_flag |= SAM_FPP; \ - _pref->extra_flag |= SAM_FPP; \ - } while (0) - - mq_adjust[0] = mq_adjust[1] = 255; // not effective - is_singleton = (p[0]->type == BWA_TYPE_NO_MATCH || p[1]->type == BWA_TYPE_NO_MATCH)? 1 : 0; - - ++n_tot[is_singleton]; - cigar[0] = cigar[1] = 0; - n_cigar[0] = n_cigar[1] = 0; - if (popt->type != BWA_PET_STD) continue; // other types of pairing is not considered - for (k = 0; k < 2; ++k) { // p[1-k] is the reference read and p[k] is the read considered to be modified - ubyte_t *seq; - if (p[1-k]->type == BWA_TYPE_NO_MATCH) continue; // if p[1-k] is unmapped, skip - { // note that popt->type == BWA_PET_STD always true; in older versions, there was a branch for color-space FF/RR reads - if (p[1-k]->strand == 0) { // then the mate is on the reverse strand and has larger coordinate - __set_rght_coor(beg[k], end[k], p[1-k], p[k]); - seq = p[k]->rseq; - } else { // then the mate is on forward stand and has smaller coordinate - __set_left_coor(beg[k], end[k], p[1-k], p[k]); - seq = p[k]->seq; - seq_reverse(p[k]->len, seq, 0); // because ->seq is reversed; this will reversed back shortly - } - } - // perform SW alignment - cigar[k] = bwa_sw_core(bns->l_pac, pacseq, p[k]->len, seq, &beg[k], end[k] - beg[k], &n_cigar[k], &cnt[k]); - if (cigar[k] && p[k]->type != BWA_TYPE_NO_MATCH) { // re-evaluate cigar[k] - int s_old, clip = 0, s_new; - if (__cigar_op(cigar[k][0]) == 3) clip += __cigar_len(cigar[k][0]); - if (__cigar_op(cigar[k][n_cigar[k]-1]) == 3) clip += __cigar_len(cigar[k][n_cigar[k]-1]); - s_old = (int)((p[k]->n_mm * 9 + p[k]->n_gapo * 13 + p[k]->n_gape * 2) / 3. * 8. + .499); - s_new = (int)(((cnt[k]>>16) * 9 + (cnt[k]>>8&0xff) * 13 + (cnt[k]&0xff) * 2 + clip * 3) / 3. * 8. 
+ .499); - s_old += -4.343 * log(ii->ap_prior / bns->l_pac); - s_new += (int)(-4.343 * log(.5 * erfc(M_SQRT1_2 * 1.5) + .499)); // assume the mapped isize is 1.5\sigma - if (s_old < s_new) { // reject SW alignment - mq_adjust[k] = s_new - s_old; - free(cigar[k]); cigar[k] = 0; n_cigar[k] = 0; - } else mq_adjust[k] = s_old - s_new; - } - // now revserse sequence back such that p[*]->seq looks untouched - if (popt->type == BWA_PET_STD) { - if (p[1-k]->strand == 1) seq_reverse(p[k]->len, seq, 0); - } else { - if (p[1-k]->strand == 0) seq_reverse(p[k]->len, seq, 0); - } - } - k = -1; // no read to be changed - if (cigar[0] && cigar[1]) { - k = p[0]->mapQ < p[1]->mapQ? 0 : 1; // p[k] to be fixed - mapQ = abs(p[1]->mapQ - p[0]->mapQ); - } else if (cigar[0]) k = 0, mapQ = p[1]->mapQ; - else if (cigar[1]) k = 1, mapQ = p[0]->mapQ; - if (k >= 0 && p[k]->pos != beg[k]) { - ++n_mapped[is_singleton]; - { // recalculate mapping quality - int tmp = (int)p[1-k]->mapQ - p[k]->mapQ/2 - 8; - if (tmp <= 0) tmp = 1; - if (mapQ > tmp) mapQ = tmp; - p[k]->mapQ = p[1-k]->mapQ = mapQ; - p[k]->seQ = p[1-k]->seQ = p[1-k]->seQ < mapQ? 
p[1-k]->seQ : mapQ; - if (p[k]->mapQ > mq_adjust[k]) p[k]->mapQ = mq_adjust[k]; - if (p[k]->seQ > mq_adjust[k]) p[k]->seQ = mq_adjust[k]; - } - // update CIGAR - free(p[k]->cigar); p[k]->cigar = cigar[k]; cigar[k] = 0; - p[k]->n_cigar = n_cigar[k]; - // update the rest of information - __set_fixed(p[1-k], p[k], beg[k], cnt[k]); - } - free(cigar[0]); free(cigar[1]); - } - } - fprintf(stderr, "[bwa_paired_sw] %lld out of %lld Q%d singletons are mated.\n", - (long long)n_mapped[1], (long long)n_tot[1], SW_MIN_MAPQ); - fprintf(stderr, "[bwa_paired_sw] %lld out of %lld Q%d discordant pairs are fixed.\n", - (long long)n_mapped[0], (long long)n_tot[0], SW_MIN_MAPQ); - return pacseq; -} - -void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const fn_fa[2], pe_opt_t *popt, const char *rg_line) -{ - extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa); - int i, j, n_seqs, tot_seqs = 0; - bwa_seq_t *seqs[2]; - bwa_seqio_t *ks[2]; - clock_t t; - bntseq_t *bns; - FILE *fp_sa[2]; - gap_opt_t opt, opt0; - khint_t iter; - isize_info_t last_ii; // this is for the last batch of reads - char str[1024], magic[2][4]; - bwt_t *bwt; - uint8_t *pac; - - // initialization - bwase_initialize(); // initialize g_log_n[] in bwase.c - pac = 0; bwt = 0; - for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5); - bns = bns_restore(prefix); - srand48(bns->seed); - fp_sa[0] = xopen(fn_sa[0], "r"); - fp_sa[1] = xopen(fn_sa[1], "r"); - g_hash = kh_init(b128); - last_ii.avg = -1.0; - - err_fread_noeof(magic[0], 1, 4, fp_sa[0]); - err_fread_noeof(magic[1], 1, 4, fp_sa[1]); - if (strncmp(magic[0], SAI_MAGIC, 4) != 0 || strncmp(magic[1], SAI_MAGIC, 4) != 0) { - fprintf(stderr, "[E::%s] Unmatched SAI magic. 
Please re-run `aln' with the same version of bwa.\n", __func__); - exit(1); - } - err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa[0]); - ks[0] = bwa_open_reads(opt.mode, fn_fa[0]); - opt0 = opt; - err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa[1]); // overwritten! - ks[1] = bwa_open_reads(opt.mode, fn_fa[1]); - { // for Illumina alignment only - if (popt->is_preload) { - strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); - strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt); - pac = (ubyte_t*)calloc(bns->l_pac/4+1, 1); - err_rewind(bns->fp_pac); - err_fread_noeof(pac, 1, bns->l_pac/4+1, bns->fp_pac); - } - } - - // core loop - bwa_print_sam_hdr(bns, rg_line); - while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, opt0.mode, opt0.trim_qual)) != 0) { - int cnt_chg; - isize_info_t ii; - ubyte_t *pacseq; - - seqs[1] = bwa_read_seq(ks[1], 0x40000, &n_seqs, opt.mode, opt.trim_qual); - tot_seqs += n_seqs; - t = clock(); - - fprintf(stderr, "[bwa_sai2sam_pe_core] convert to sequence coordinate... \n"); - cnt_chg = bwa_cal_pac_pos_pe(bns, prefix, bwt, n_seqs, seqs, fp_sa, &ii, popt, &opt, &last_ii); - fprintf(stderr, "[bwa_sai2sam_pe_core] time elapses: %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); - fprintf(stderr, "[bwa_sai2sam_pe_core] changing coordinates of %d alignments.\n", cnt_chg); - - fprintf(stderr, "[bwa_sai2sam_pe_core] align unmapped mate...\n"); - pacseq = bwa_paired_sw(bns, pac, n_seqs, seqs, popt, &ii); - fprintf(stderr, "[bwa_sai2sam_pe_core] time elapses: %.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); - - fprintf(stderr, "[bwa_sai2sam_pe_core] refine gapped alignments... "); - for (j = 0; j < 2; ++j) - bwa_refine_gapped(bns, n_seqs, seqs[j], pacseq); - fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); - if (pac == 0) free(pacseq); - - fprintf(stderr, "[bwa_sai2sam_pe_core] print alignments... 
"); - for (i = 0; i < n_seqs; ++i) { - bwa_seq_t *p[2]; - p[0] = seqs[0] + i; p[1] = seqs[1] + i; - if (p[0]->bc[0] || p[1]->bc[0]) { - strcat(p[0]->bc, p[1]->bc); - strcpy(p[1]->bc, p[0]->bc); - } - bwa_print_sam1(bns, p[0], p[1], opt.mode, opt.max_top2); - bwa_print_sam1(bns, p[1], p[0], opt.mode, opt.max_top2); - if (strcmp(p[0]->name, p[1]->name) != 0) err_fatal(__func__, "paired reads have different names: \"%s\", \"%s\"\n", p[0]->name, p[1]->name); - } - fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); - - for (j = 0; j < 2; ++j) - bwa_free_read_seq(n_seqs, seqs[j]); - fprintf(stderr, "[bwa_sai2sam_pe_core] %d sequences have been processed.\n", tot_seqs); - last_ii = ii; - } - - // destroy - bns_destroy(bns); - for (i = 0; i < 2; ++i) { - bwa_seq_close(ks[i]); - err_fclose(fp_sa[i]); - } - for (iter = kh_begin(g_hash); iter != kh_end(g_hash); ++iter) - if (kh_exist(g_hash, iter)) free(kh_val(g_hash, iter).a); - kh_destroy(b128, g_hash); - if (pac) { - free(pac); bwt_destroy(bwt); - } -} - -int bwa_sai2sam_pe(int argc, char *argv[]) -{ - int c; - pe_opt_t *popt; - char *prefix, *rg_line = 0; - - popt = bwa_init_pe_opt(); - while ((c = getopt(argc, argv, "a:o:sPn:N:c:f:Ar:")) >= 0) { - switch (c) { - case 'r': - if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; - break; - case 'a': popt->max_isize = atoi(optarg); break; - case 'o': popt->max_occ = atoi(optarg); break; - case 's': popt->is_sw = 0; break; - case 'P': popt->is_preload = 1; break; - case 'n': popt->n_multi = atoi(optarg); break; - case 'N': popt->N_multi = atoi(optarg); break; - case 'c': popt->ap_prior = atof(optarg); break; - case 'f': xreopen(optarg, "w", stdout); break; - case 'A': popt->force_isize = 1; break; - default: return 1; - } - } - - if (optind + 5 > argc) { - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: bwa sampe [options] \n\n"); - fprintf(stderr, "Options: -a INT maximum insert size [%d]\n", popt->max_isize); - fprintf(stderr, " -o INT maximum 
occurrences for one end [%d]\n", popt->max_occ); - fprintf(stderr, " -n INT maximum hits to output for paired reads [%d]\n", popt->n_multi); - fprintf(stderr, " -N INT maximum hits to output for discordant pairs [%d]\n", popt->N_multi); - fprintf(stderr, " -c FLOAT prior of chimeric rate (lower bound) [%.1le]\n", popt->ap_prior); - fprintf(stderr, " -f FILE sam file to output results to [stdout]\n"); - fprintf(stderr, " -r STR read group header line such as `@RG\\tID:foo\\tSM:bar' [null]\n"); - fprintf(stderr, " -P preload index into memory (for base-space reads only)\n"); - fprintf(stderr, " -s disable Smith-Waterman for the unmapped mate\n"); - fprintf(stderr, " -A disable insert size estimate (force -s)\n\n"); - fprintf(stderr, "Notes: 1. For SOLiD reads, corresponds R3 reads and to F3.\n"); - fprintf(stderr, " 2. For reads shorter than 30bp, applying a smaller -o is recommended to\n"); - fprintf(stderr, " to get a sensible speed at the cost of pairing accuracy.\n"); - fprintf(stderr, "\n"); - return 1; - } - if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) { - fprintf(stderr, "[%s] fail to locate the index\n", __func__); - return 1; - } - bwa_sai2sam_pe_core(prefix, argv + optind + 1, argv + optind+3, popt, rg_line); - free(prefix); free(popt); - return 0; -} diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/bwase.c --- a/bwa-0.7.9a/bwase.c Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,602 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include "bwase.h" -#include "bwtaln.h" -#include "bntseq.h" -#include "utils.h" -#include "kstring.h" -#include "bwa.h" -#include "ksw.h" - -#ifdef USE_MALLOC_WRAPPERS -# include "malloc_wrap.h" -#endif - -int g_log_n[256]; - -void bwa_aln2seq_core(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s, int set_main, int n_multi) -{ - int i, cnt, best; - if (n_aln == 0) { - s->type = BWA_TYPE_NO_MATCH; - s->c1 = s->c2 = 0; - return; - } - - if (set_main) { - best = 
aln[0].score; - for (i = cnt = 0; i < n_aln; ++i) { - const bwt_aln1_t *p = aln + i; - if (p->score > best) break; - if (drand48() * (p->l - p->k + 1 + cnt) > (double)cnt) { - s->n_mm = p->n_mm; s->n_gapo = p->n_gapo; s->n_gape = p->n_gape; - s->ref_shift = (int)p->n_del - (int)p->n_ins; - s->score = p->score; - s->sa = p->k + (bwtint_t)((p->l - p->k + 1) * drand48()); - } - cnt += p->l - p->k + 1; - } - s->c1 = cnt; - for (; i < n_aln; ++i) cnt += aln[i].l - aln[i].k + 1; - s->c2 = cnt - s->c1; - s->type = s->c1 > 1? BWA_TYPE_REPEAT : BWA_TYPE_UNIQUE; - } - - if (n_multi) { - int k, rest, n_occ, z = 0; - for (k = n_occ = 0; k < n_aln; ++k) { - const bwt_aln1_t *q = aln + k; - n_occ += q->l - q->k + 1; - } - if (s->multi) free(s->multi); - if (n_occ > n_multi + 1) { // if there are too many hits, generate none of them - s->multi = 0; s->n_multi = 0; - return; - } - /* The following code is more flexible than what is required - * here. In principle, due to the requirement above, we can - * simply output all hits, but the following samples "rest" - * number of random hits. */ - rest = n_occ > n_multi + 1? n_multi + 1 : n_occ; // find one additional for ->sa - s->multi = calloc(rest, sizeof(bwt_multi1_t)); - for (k = 0; k < n_aln; ++k) { - const bwt_aln1_t *q = aln + k; - if (q->l - q->k + 1 <= rest) { - bwtint_t l; - for (l = q->k; l <= q->l; ++l) { - s->multi[z].pos = l; - s->multi[z].gap = q->n_gapo + q->n_gape; - s->multi[z].ref_shift = (int)q->n_del - (int)q->n_ins; - s->multi[z++].mm = q->n_mm; - } - rest -= q->l - q->k + 1; - } else { // Random sampling (http://code.activestate.com/recipes/272884/). In fact, we never come here. 
- int j, i; - for (j = rest, i = q->l - q->k + 1; j > 0; --j) { - double p = 1.0, x = drand48(); - while (x < p) p -= p * j / (i--); - s->multi[z].pos = q->l - i; - s->multi[z].gap = q->n_gapo + q->n_gape; - s->multi[z].ref_shift = (int)q->n_del - (int)q->n_ins; - s->multi[z++].mm = q->n_mm; - } - rest = 0; - break; - } - } - s->n_multi = z; - } -} - -void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s) -{ - bwa_aln2seq_core(n_aln, aln, s, 1, 0); -} - -int bwa_approx_mapQ(const bwa_seq_t *p, int mm) -{ - int n; - if (p->c1 == 0) return 23; - if (p->c1 > 1) return 0; - if (p->n_mm == mm) return 25; - if (p->c2 == 0) return 37; - n = (p->c2 >= 255)? 255 : p->c2; - return (23 < g_log_n[n])? 0 : 23 - g_log_n[n]; -} - -bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int ref_len, int *strand) -{ - bwtint_t pos_f; - int is_rev; - pos_f = bwt_sa(bwt, sapos); // position on the forward-reverse coordinate - if (pos_f < bns->l_pac && bns->l_pac < pos_f + ref_len) return (bwtint_t)-1; - pos_f = bns_depos(bns, pos_f, &is_rev); // position on the forward strand; this may be the first base or the last base - *strand = !is_rev; - if (is_rev) pos_f = pos_f + 1 < ref_len? 0 : pos_f - ref_len + 1; // position of the first base - return pos_f; // FIXME: it is possible that pos_f < bns->anns[ref_id].offset -} - -/** - * Derive the actual position in the read from the given suffix array - * coordinates. Note that the position will be approximate based on - * whether indels appear in the read and whether calculations are - * performed from the start or end of the read. - */ -void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t *bwt, bwa_seq_t *seq, const int max_mm, const float fnr) -{ - int max_diff, strand; - if (seq->type != BWA_TYPE_UNIQUE && seq->type != BWA_TYPE_REPEAT) return; - max_diff = fnr > 0.0? 
bwa_cal_maxdiff(seq->len, BWA_AVG_ERR, fnr) : max_mm; - seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff); - //fprintf(stderr, "%d\n", seq->ref_shift); - seq->pos = bwa_sa2pos(bns, bwt, seq->sa, seq->len + seq->ref_shift, &strand); - seq->strand = strand; - seq->seQ = seq->mapQ = bwa_approx_mapQ(seq, max_diff); - if (seq->pos == (bwtint_t)-1) seq->type = BWA_TYPE_NO_MATCH; -} - -void bwa_cal_pac_pos(const bntseq_t *bns, const char *prefix, int n_seqs, bwa_seq_t *seqs, int max_mm, float fnr) -{ - int i, j, strand, n_multi; - char str[1024]; - bwt_t *bwt; - // load forward SA - strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); - strcpy(str, prefix); strcat(str, ".sa"); bwt_restore_sa(str, bwt); - for (i = 0; i != n_seqs; ++i) { - bwa_seq_t *p = &seqs[i]; - bwa_cal_pac_pos_core(bns, bwt, p, max_mm, fnr); - for (j = n_multi = 0; j < p->n_multi; ++j) { - bwt_multi1_t *q = p->multi + j; - q->pos = bwa_sa2pos(bns, bwt, q->pos, p->len + q->ref_shift, &strand); - q->strand = strand; - if (q->pos != p->pos && q->pos != (bwtint_t)-1) - p->multi[n_multi++] = *q; - } - p->n_multi = n_multi; - } - bwt_destroy(bwt); -} - -#define SW_BW 50 - -bwa_cigar_t *bwa_refine_gapped_core(bwtint_t l_pac, const ubyte_t *pacseq, int len, ubyte_t *seq, int ref_shift, bwtint_t *_rb, int *n_cigar) -{ - bwa_cigar_t *cigar = 0; - uint32_t *cigar32 = 0; - ubyte_t *rseq; - int64_t k, rb, re, rlen; - int8_t mat[25]; - - bwa_fill_scmat(1, 3, mat); - rb = *_rb; re = rb + len + ref_shift; - assert(re <= l_pac); - rseq = bns_get_seq(l_pac, pacseq, rb, re, &rlen); - assert(re - rb == rlen); - ksw_global(len, seq, rlen, rseq, 5, mat, 5, 1, SW_BW > abs(rlen - len) * 1.5? 
SW_BW : abs(rlen - len) * 1.5, n_cigar, &cigar32); - assert(*n_cigar > 0); - if ((cigar32[*n_cigar - 1]&0xf) == 1) cigar32[*n_cigar - 1] = (cigar32[*n_cigar - 1]>>4<<4) | 3; // change endding ins to soft clipping - if ((cigar32[0]&0xf) == 1) cigar32[0] = (cigar32[0]>>4<<4) | 3; // change beginning ins to soft clipping - if ((cigar32[*n_cigar - 1]&0xf) == 2) --*n_cigar; // delete endding del - if ((cigar32[0]&0xf) == 2) { // delete beginning del - *_rb += cigar32[0]>>4; - --*n_cigar; - memmove(cigar32, cigar32+1, (*n_cigar) * 4); - } - cigar = (bwa_cigar_t*)cigar32; - for (k = 0; k < *n_cigar; ++k) - cigar[k] = __cigar_create((cigar32[k]&0xf), (cigar32[k]>>4)); - free(rseq); - return cigar; -} - -char *bwa_cal_md1(int n_cigar, bwa_cigar_t *cigar, int len, bwtint_t pos, ubyte_t *seq, - bwtint_t l_pac, ubyte_t *pacseq, kstring_t *str, int *_nm) -{ - bwtint_t x, y; - int z, u, c, nm = 0; - str->l = 0; // reset - x = pos; y = 0; - if (cigar) { - int k, l; - for (k = u = 0; k < n_cigar; ++k) { - l = __cigar_len(cigar[k]); - if (__cigar_op(cigar[k]) == FROM_M) { - for (z = 0; z < l && x+z < l_pac; ++z) { - c = pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3; - if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) { - ksprintf(str, "%d", u); - kputc("ACGTN"[c], str); - ++nm; - u = 0; - } else ++u; - } - x += l; y += l; - } else if (__cigar_op(cigar[k]) == FROM_I || __cigar_op(cigar[k]) == FROM_S) { - y += l; - if (__cigar_op(cigar[k]) == FROM_I) nm += l; - } else if (__cigar_op(cigar[k]) == FROM_D) { - ksprintf(str, "%d", u); - kputc('^', str); - for (z = 0; z < l && x+z < l_pac; ++z) - kputc("ACGT"[pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3], str); - u = 0; - x += l; nm += l; - } - } - } else { // no gaps - for (z = u = 0; z < (bwtint_t)len && x+z < l_pac; ++z) { - c = pacseq[(x+z)>>2] >> ((~(x+z)&3)<<1) & 3; - if (c > 3 || seq[y+z] > 3 || c != seq[y+z]) { - ksprintf(str, "%d", u); - kputc("ACGTN"[c], str); - ++nm; - u = 0; - } else ++u; - } - } - ksprintf(str, "%d", u); - *_nm = nm; - return 
strdup(str->s); -} - -void bwa_correct_trimmed(bwa_seq_t *s) -{ - if (s->len == s->full_len) return; - if (s->strand == 0) { // forward - if (s->cigar && __cigar_op(s->cigar[s->n_cigar-1]) == FROM_S) { // the last is S - s->cigar[s->n_cigar-1] += s->full_len - s->len; - } else { - if (s->cigar == 0) { - s->n_cigar = 2; - s->cigar = calloc(s->n_cigar, sizeof(bwa_cigar_t)); - s->cigar[0] = __cigar_create(0, s->len); - } else { - ++s->n_cigar; - s->cigar = realloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t)); - } - s->cigar[s->n_cigar-1] = __cigar_create(3, (s->full_len - s->len)); - } - } else { // reverse - if (s->cigar && __cigar_op(s->cigar[0]) == FROM_S) { // the first is S - s->cigar[0] += s->full_len - s->len; - } else { - if (s->cigar == 0) { - s->n_cigar = 2; - s->cigar = calloc(s->n_cigar, sizeof(bwa_cigar_t)); - s->cigar[1] = __cigar_create(0, s->len); - } else { - ++s->n_cigar; - s->cigar = realloc(s->cigar, s->n_cigar * sizeof(bwa_cigar_t)); - memmove(s->cigar + 1, s->cigar, (s->n_cigar-1) * sizeof(bwa_cigar_t)); - } - s->cigar[0] = __cigar_create(3, (s->full_len - s->len)); - } - } - s->len = s->full_len; -} - -void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq) -{ - ubyte_t *pacseq; - int i, j, k; - kstring_t *str; - - if (!_pacseq) { - pacseq = (ubyte_t*)calloc(bns->l_pac/4+1, 1); - err_rewind(bns->fp_pac); - err_fread_noeof(pacseq, 1, bns->l_pac/4+1, bns->fp_pac); - } else pacseq = _pacseq; - for (i = 0; i != n_seqs; ++i) { - bwa_seq_t *s = seqs + i; - seq_reverse(s->len, s->seq, 0); // IMPORTANT: s->seq is reversed here!!! - for (j = k = 0; j < s->n_multi; ++j) { - bwt_multi1_t *q = s->multi + j; - int n_cigar; - if (q->gap) { // gapped alignment - q->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, q->strand? 
s->rseq : s->seq, q->ref_shift, &q->pos, &n_cigar); - q->n_cigar = n_cigar; - if (q->cigar) s->multi[k++] = *q; - } else s->multi[k++] = *q; - } - s->n_multi = k; // this squeezes out gapped alignments which failed the CIGAR generation - if (s->type == BWA_TYPE_NO_MATCH || s->type == BWA_TYPE_MATESW || s->n_gapo == 0) continue; - s->cigar = bwa_refine_gapped_core(bns->l_pac, pacseq, s->len, s->strand? s->rseq : s->seq, s->ref_shift, &s->pos, &s->n_cigar); - if (s->cigar == 0) s->type = BWA_TYPE_NO_MATCH; - } - // generate MD tag - str = (kstring_t*)calloc(1, sizeof(kstring_t)); - for (i = 0; i != n_seqs; ++i) { - bwa_seq_t *s = seqs + i; - if (s->type != BWA_TYPE_NO_MATCH) { - int nm; - s->md = bwa_cal_md1(s->n_cigar, s->cigar, s->len, s->pos, s->strand? s->rseq : s->seq, bns->l_pac, pacseq, str, &nm); - s->nm = nm; - } - } - free(str->s); free(str); - - // correct for trimmed reads - for (i = 0; i < n_seqs; ++i) bwa_correct_trimmed(seqs + i); - - if (!_pacseq) free(pacseq); -} - -int64_t pos_end(const bwa_seq_t *p) -{ - if (p->cigar) { - int j; - int64_t x = p->pos; - for (j = 0; j != p->n_cigar; ++j) { - int op = __cigar_op(p->cigar[j]); - if (op == 0 || op == 2) x += __cigar_len(p->cigar[j]); - } - return x; - } else return p->pos + p->len; -} - -int64_t pos_end_multi(const bwt_multi1_t *p, int len) // analogy to pos_end() -{ - if (p->cigar) { - int j; - int64_t x = p->pos; - for (j = 0; j != p->n_cigar; ++j) { - int op = __cigar_op(p->cigar[j]); - if (op == 0 || op == 2) x += __cigar_len(p->cigar[j]); - } - return x; - } else return p->pos + len; -} - -static int64_t pos_5(const bwa_seq_t *p) -{ - if (p->type != BWA_TYPE_NO_MATCH) - return p->strand? pos_end(p) : p->pos; - return -1; -} - -void bwa_print_seq(FILE *stream, bwa_seq_t *seq) { - char buffer[4096]; - const int bsz = sizeof(buffer); - int i, j, l; - - if (seq->strand == 0) { - for (i = 0; i < seq->full_len; i += bsz) { - l = seq->full_len - i > bsz ? 
bsz : seq->full_len - i; - for (j = 0; j < l; j++) buffer[j] = "ACGTN"[seq->seq[i + j]]; - err_fwrite(buffer, 1, l, stream); - } - } else { - for (i = seq->full_len - 1; i >= 0; i -= bsz) { - l = i + 1 > bsz ? bsz : i + 1; - for (j = 0; j < l; j++) buffer[j] = "TGCAN"[seq->seq[i - j]]; - err_fwrite(buffer, 1, l, stream); - } - } -} - -void bwa_print_sam1(const bntseq_t *bns, bwa_seq_t *p, const bwa_seq_t *mate, int mode, int max_top2) -{ - int j; - if (p->type != BWA_TYPE_NO_MATCH || (mate && mate->type != BWA_TYPE_NO_MATCH)) { - int seqid, nn, am = 0, flag = p->extra_flag; - char XT; - - if (p->type == BWA_TYPE_NO_MATCH) { - p->pos = mate->pos; - p->strand = mate->strand; - flag |= SAM_FSU; - j = 1; - } else j = pos_end(p) - p->pos; // j is the length of the reference in the alignment - - // get seqid - nn = bns_cnt_ambi(bns, p->pos, j, &seqid); - if (p->type != BWA_TYPE_NO_MATCH && p->pos + j - bns->anns[seqid].offset > bns->anns[seqid].len) - flag |= SAM_FSU; // flag UNMAP as this alignment bridges two adjacent reference sequences - - // update flag and print it - if (p->strand) flag |= SAM_FSR; - if (mate) { - if (mate->type != BWA_TYPE_NO_MATCH) { - if (mate->strand) flag |= SAM_FMR; - } else flag |= SAM_FMU; - } - err_printf("%s\t%d\t%s\t", p->name, flag, bns->anns[seqid].name); - err_printf("%d\t%d\t", (int)(p->pos - bns->anns[seqid].offset + 1), p->mapQ); - - // print CIGAR - if (p->cigar) { - for (j = 0; j != p->n_cigar; ++j) - err_printf("%d%c", __cigar_len(p->cigar[j]), "MIDS"[__cigar_op(p->cigar[j])]); - } else if (p->type == BWA_TYPE_NO_MATCH) err_printf("*"); - else err_printf("%dM", p->len); - - // print mate coordinate - if (mate && mate->type != BWA_TYPE_NO_MATCH) { - int m_seqid; - long long isize; - am = mate->seQ < p->seQ? mate->seQ : p->seQ; // smaller single-end mapping quality - // redundant calculation here, but should not matter too much - bns_cnt_ambi(bns, mate->pos, mate->len, &m_seqid); - err_printf("\t%s\t", (seqid == m_seqid)? 
"=" : bns->anns[m_seqid].name); - isize = (seqid == m_seqid)? pos_5(mate) - pos_5(p) : 0; - if (p->type == BWA_TYPE_NO_MATCH) isize = 0; - err_printf("%d\t%lld\t", (int)(mate->pos - bns->anns[m_seqid].offset + 1), isize); - } else if (mate) err_printf("\t=\t%d\t0\t", (int)(p->pos - bns->anns[seqid].offset + 1)); - else err_printf("\t*\t0\t0\t"); - - // print sequence and quality - bwa_print_seq(stdout, p); - err_putchar('\t'); - if (p->qual) { - if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality - err_printf("%s", p->qual); - } else err_printf("*"); - - if (bwa_rg_id[0]) err_printf("\tRG:Z:%s", bwa_rg_id); - if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc); - if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len); - if (p->type != BWA_TYPE_NO_MATCH) { - int i; - // calculate XT tag - XT = "NURM"[p->type]; - if (nn > 10) XT = 'N'; - // print tags - err_printf("\tXT:A:%c\t%s:i:%d", XT, (mode & BWA_MODE_COMPREAD)? "NM" : "CM", p->nm); - if (nn) err_printf("\tXN:i:%d", nn); - if (mate) err_printf("\tSM:i:%d\tAM:i:%d", p->seQ, am); - if (p->type != BWA_TYPE_MATESW) { // X0 and X1 are not available for this type of alignment - err_printf("\tX0:i:%d", p->c1); - if (p->c1 <= max_top2) err_printf("\tX1:i:%d", p->c2); - } - err_printf("\tXM:i:%d\tXO:i:%d\tXG:i:%d", p->n_mm, p->n_gapo, p->n_gapo+p->n_gape); - if (p->md) err_printf("\tMD:Z:%s", p->md); - // print multiple hits - if (p->n_multi) { - err_printf("\tXA:Z:"); - for (i = 0; i < p->n_multi; ++i) { - bwt_multi1_t *q = p->multi + i; - int k; - j = pos_end_multi(q, p->len) - q->pos; - nn = bns_cnt_ambi(bns, q->pos, j, &seqid); - err_printf("%s,%c%d,", bns->anns[seqid].name, q->strand? 
'-' : '+', - (int)(q->pos - bns->anns[seqid].offset + 1)); - if (q->cigar) { - for (k = 0; k < q->n_cigar; ++k) - err_printf("%d%c", __cigar_len(q->cigar[k]), "MIDS"[__cigar_op(q->cigar[k])]); - } else err_printf("%dM", p->len); - err_printf(",%d;", q->gap + q->mm); - } - } - } - err_putchar('\n'); - } else { // this read has no match - //ubyte_t *s = p->strand? p->rseq : p->seq; - int flag = p->extra_flag | SAM_FSU; - if (mate && mate->type == BWA_TYPE_NO_MATCH) flag |= SAM_FMU; - err_printf("%s\t%d\t*\t0\t0\t*\t*\t0\t0\t", p->name, flag); - //Why did this work differently to the version above?? - //for (j = 0; j != p->len; ++j) putchar("ACGTN"[(int)s[j]]); - bwa_print_seq(stdout, p); - err_putchar('\t'); - if (p->qual) { - if (p->strand) seq_reverse(p->len, p->qual, 0); // reverse quality - err_printf("%s", p->qual); - } else err_printf("*"); - if (bwa_rg_id[0]) err_printf("\tRG:Z:%s", bwa_rg_id); - if (p->bc[0]) err_printf("\tBC:Z:%s", p->bc); - if (p->clip_len < p->full_len) err_printf("\tXC:i:%d", p->clip_len); - err_putchar('\n'); - } -} - -void bwase_initialize() -{ - int i; - for (i = 1; i != 256; ++i) g_log_n[i] = (int)(4.343 * log(i) + 0.5); -} - -void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_fa, int n_occ, const char *rg_line) -{ - extern bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa); - int i, n_seqs, tot_seqs = 0, m_aln; - bwt_aln1_t *aln = 0; - bwa_seq_t *seqs; - bwa_seqio_t *ks; - clock_t t; - bntseq_t *bns; - FILE *fp_sa; - gap_opt_t opt; - char magic[4]; - - // initialization - bwase_initialize(); - bns = bns_restore(prefix); - srand48(bns->seed); - fp_sa = xopen(fn_sa, "r"); - - m_aln = 0; - err_fread_noeof(magic, 1, 4, fp_sa); - if (strncmp(magic, SAI_MAGIC, 4) != 0) { - fprintf(stderr, "[E::%s] Unmatched SAI magic. 
Please re-run `aln' with the same version of bwa.\n", __func__); - exit(1); - } - err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa); - bwa_print_sam_hdr(bns, rg_line); - // set ks - ks = bwa_open_reads(opt.mode, fn_fa); - // core loop - while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt.mode, opt.trim_qual)) != 0) { - tot_seqs += n_seqs; - t = clock(); - - // read alignment - for (i = 0; i < n_seqs; ++i) { - bwa_seq_t *p = seqs + i; - int n_aln; - err_fread_noeof(&n_aln, 4, 1, fp_sa); - if (n_aln > m_aln) { - m_aln = n_aln; - aln = (bwt_aln1_t*)realloc(aln, sizeof(bwt_aln1_t) * m_aln); - } - err_fread_noeof(aln, sizeof(bwt_aln1_t), n_aln, fp_sa); - bwa_aln2seq_core(n_aln, aln, p, 1, n_occ); - } - - fprintf(stderr, "[bwa_aln_core] convert to sequence coordinate... "); - bwa_cal_pac_pos(bns, prefix, n_seqs, seqs, opt.max_diff, opt.fnr); // forward bwt will be destroyed here - fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); - - fprintf(stderr, "[bwa_aln_core] refine gapped alignments... "); - bwa_refine_gapped(bns, n_seqs, seqs, 0); - fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); t = clock(); - - fprintf(stderr, "[bwa_aln_core] print alignments... 
"); - for (i = 0; i < n_seqs; ++i) - bwa_print_sam1(bns, seqs + i, 0, opt.mode, opt.max_top2); - fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); - - bwa_free_read_seq(n_seqs, seqs); - fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs); - } - - // destroy - bwa_seq_close(ks); - bns_destroy(bns); - err_fclose(fp_sa); - free(aln); -} - -int bwa_sai2sam_se(int argc, char *argv[]) -{ - int c, n_occ = 3; - char *prefix, *rg_line = 0; - while ((c = getopt(argc, argv, "hn:f:r:")) >= 0) { - switch (c) { - case 'h': break; - case 'r': - if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; - break; - case 'n': n_occ = atoi(optarg); break; - case 'f': xreopen(optarg, "w", stdout); break; - default: return 1; - } - } - - if (optind + 3 > argc) { - fprintf(stderr, "Usage: bwa samse [-n max_occ] [-f out.sam] [-r RG_line] \n"); - return 1; - } - if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) { - fprintf(stderr, "[%s] fail to locate the index\n", __func__); - return 1; - } - bwa_sai2sam_se_core(prefix, argv[optind+1], argv[optind+2], n_occ, rg_line); - free(prefix); - return 0; -} diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/bwase.h --- a/bwa-0.7.9a/bwase.h Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,29 +0,0 @@ -#ifndef BWASE_H -#define BWASE_H - -#include "bntseq.h" -#include "bwt.h" -#include "bwtaln.h" - -#ifdef __cplusplus -extern "C" { -#endif - - // Initialize mapping tables in the bwa single-end mapper. - void bwase_initialize(); - // Calculate the approximate position of the sequence from the specified bwt with loaded suffix array. - void bwa_cal_pac_pos_core(const bntseq_t *bns, const bwt_t* bwt, bwa_seq_t* seq, const int max_mm, const float fnr); - // Refine the approximate position of the sequence to an actual placement for the sequence. 
- void bwa_refine_gapped(const bntseq_t *bns, int n_seqs, bwa_seq_t *seqs, ubyte_t *_pacseq); - // Backfill certain alignment properties mainly centering around number of matches. - void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s); - // Calculate the end position of a read given a certain sequence. - int64_t pos_end(const bwa_seq_t *p); - // - bwtint_t bwa_sa2pos(const bntseq_t *bns, const bwt_t *bwt, bwtint_t sapos, int len, int *strand); - -#ifdef __cplusplus -} -#endif - -#endif // BWASE_H diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/bwaseqio.c --- a/bwa-0.7.9a/bwaseqio.c Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,235 +0,0 @@ -#include -#include -#include "bwtaln.h" -#include "utils.h" -#include "bamlite.h" - -#include "kseq.h" -KSEQ_DECLARE(gzFile) - -#ifdef USE_MALLOC_WRAPPERS -# include "malloc_wrap.h" -#endif - -extern unsigned char nst_nt4_table[256]; -static char bam_nt16_nt4_table[] = { 4, 0, 1, 4, 2, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4 }; - -struct __bwa_seqio_t { - // for BAM input - int is_bam, which; // 1st bit: read1, 2nd bit: read2, 3rd: SE - bamFile fp; - // for fastq input - kseq_t *ks; -}; - -bwa_seqio_t *bwa_bam_open(const char *fn, int which) -{ - bwa_seqio_t *bs; - bam_header_t *h; - bs = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t)); - bs->is_bam = 1; - bs->which = which; - bs->fp = bam_open(fn, "r"); - if (0 == bs->fp) err_fatal_simple("Couldn't open bam file"); - h = bam_header_read(bs->fp); - bam_header_destroy(h); - return bs; -} - -bwa_seqio_t *bwa_seq_open(const char *fn) -{ - gzFile fp; - bwa_seqio_t *bs; - bs = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t)); - fp = xzopen(fn, "r"); - bs->ks = kseq_init(fp); - return bs; -} - -void bwa_seq_close(bwa_seqio_t *bs) -{ - if (bs == 0) return; - if (bs->is_bam) { - if (0 != bam_close(bs->fp)) err_fatal_simple("Error closing bam file"); - } else { - err_gzclose(bs->ks->f->f); - kseq_destroy(bs->ks); - } - free(bs); -} - -void seq_reverse(int 
len, ubyte_t *seq, int is_comp) -{ - int i; - if (is_comp) { - for (i = 0; i < len>>1; ++i) { - char tmp = seq[len-1-i]; - if (tmp < 4) tmp = 3 - tmp; - seq[len-1-i] = (seq[i] >= 4)? seq[i] : 3 - seq[i]; - seq[i] = tmp; - } - if (len&1) seq[i] = (seq[i] >= 4)? seq[i] : 3 - seq[i]; - } else { - for (i = 0; i < len>>1; ++i) { - char tmp = seq[len-1-i]; - seq[len-1-i] = seq[i]; seq[i] = tmp; - } - } -} - -int bwa_trim_read(int trim_qual, bwa_seq_t *p) -{ - int s = 0, l, max = 0, max_l = p->len; - if (trim_qual < 1 || p->qual == 0) return 0; - for (l = p->len - 1; l >= BWA_MIN_RDLEN; --l) { - s += trim_qual - (p->qual[l] - 33); - if (s < 0) break; - if (s > max) max = s, max_l = l; - } - p->clip_len = p->len = max_l; - return p->full_len - p->len; -} - -static bwa_seq_t *bwa_read_bam(bwa_seqio_t *bs, int n_needed, int *n, int is_comp, int trim_qual) -{ - bwa_seq_t *seqs, *p; - int n_seqs, l, i; - long n_trimmed = 0, n_tot = 0; - bam1_t *b; - int res; - - b = bam_init1(); - n_seqs = 0; - seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t)); - while ((res = bam_read1(bs->fp, b)) >= 0) { - uint8_t *s, *q; - int go = 0; - if ((bs->which & 1) && (b->core.flag & BAM_FREAD1)) go = 1; - if ((bs->which & 2) && (b->core.flag & BAM_FREAD2)) go = 1; - if ((bs->which & 4) && !(b->core.flag& BAM_FREAD1) && !(b->core.flag& BAM_FREAD2))go = 1; - if (go == 0) continue; - l = b->core.l_qseq; - p = &seqs[n_seqs++]; - p->tid = -1; // no assigned to a thread - p->qual = 0; - p->full_len = p->clip_len = p->len = l; - n_tot += p->full_len; - s = bam1_seq(b); q = bam1_qual(b); - p->seq = (ubyte_t*)calloc(p->len + 1, 1); - p->qual = (ubyte_t*)calloc(p->len + 1, 1); - for (i = 0; i != p->full_len; ++i) { - p->seq[i] = bam_nt16_nt4_table[(int)bam1_seqi(s, i)]; - p->qual[i] = q[i] + 33 < 126? 
q[i] + 33 : 126; - } - if (bam1_strand(b)) { // then reverse - seq_reverse(p->len, p->seq, 1); - seq_reverse(p->len, p->qual, 0); - } - if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p); - p->rseq = (ubyte_t*)calloc(p->full_len, 1); - memcpy(p->rseq, p->seq, p->len); - seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped() - seq_reverse(p->len, p->rseq, is_comp); - p->name = strdup((const char*)bam1_qname(b)); - if (n_seqs == n_needed) break; - } - if (res < 0 && res != -1) err_fatal_simple("Error reading bam file"); - *n = n_seqs; - if (n_seqs && trim_qual >= 1) - fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot); - if (n_seqs == 0) { - free(seqs); - bam_destroy1(b); - return 0; - } - bam_destroy1(b); - return seqs; -} - -#define BARCODE_LOW_QUAL 13 - -bwa_seq_t *bwa_read_seq(bwa_seqio_t *bs, int n_needed, int *n, int mode, int trim_qual) -{ - bwa_seq_t *seqs, *p; - kseq_t *seq = bs->ks; - int n_seqs, l, i, is_comp = mode&BWA_MODE_COMPREAD, is_64 = mode&BWA_MODE_IL13, l_bc = mode>>24; - long n_trimmed = 0, n_tot = 0; - - if (l_bc > BWA_MAX_BCLEN) { - fprintf(stderr, "[%s] the maximum barcode length is %d.\n", __func__, BWA_MAX_BCLEN); - return 0; - } - if (bs->is_bam) return bwa_read_bam(bs, n_needed, n, is_comp, trim_qual); // l_bc has no effect for BAM input - n_seqs = 0; - seqs = (bwa_seq_t*)calloc(n_needed, sizeof(bwa_seq_t)); - while ((l = kseq_read(seq)) >= 0) { - if ((mode & BWA_MODE_CFY) && (seq->comment.l != 0)) { - // skip reads that are marked to be filtered by Casava - char *s = index(seq->comment.s, ':'); - if (s && *(++s) == 'Y') { - continue; - } - } - if (is_64 && seq->qual.l) - for (i = 0; i < seq->qual.l; ++i) seq->qual.s[i] -= 31; - if (seq->seq.l <= l_bc) continue; // sequence length equals or smaller than the barcode length - p = &seqs[n_seqs++]; - if (l_bc) { // then trim barcode - for (i = 0; i < l_bc; ++i) - p->bc[i] = (seq->qual.l && seq->qual.s[i]-33 < 
BARCODE_LOW_QUAL)? tolower(seq->seq.s[i]) : toupper(seq->seq.s[i]); - p->bc[i] = 0; - for (; i < seq->seq.l; ++i) - seq->seq.s[i - l_bc] = seq->seq.s[i]; - seq->seq.l -= l_bc; seq->seq.s[seq->seq.l] = 0; - if (seq->qual.l) { - for (i = l_bc; i < seq->qual.l; ++i) - seq->qual.s[i - l_bc] = seq->qual.s[i]; - seq->qual.l -= l_bc; seq->qual.s[seq->qual.l] = 0; - } - l = seq->seq.l; - } else p->bc[0] = 0; - p->tid = -1; // no assigned to a thread - p->qual = 0; - p->full_len = p->clip_len = p->len = l; - n_tot += p->full_len; - p->seq = (ubyte_t*)calloc(p->full_len, 1); - for (i = 0; i != p->full_len; ++i) - p->seq[i] = nst_nt4_table[(int)seq->seq.s[i]]; - if (seq->qual.l) { // copy quality - p->qual = (ubyte_t*)strdup((char*)seq->qual.s); - if (trim_qual >= 1) n_trimmed += bwa_trim_read(trim_qual, p); - } - p->rseq = (ubyte_t*)calloc(p->full_len, 1); - memcpy(p->rseq, p->seq, p->len); - seq_reverse(p->len, p->seq, 0); // *IMPORTANT*: will be reversed back in bwa_refine_gapped() - seq_reverse(p->len, p->rseq, is_comp); - p->name = strdup((const char*)seq->name.s); - { // trim /[12]$ - int t = strlen(p->name); - if (t > 2 && p->name[t-2] == '/' && (p->name[t-1] == '1' || p->name[t-1] == '2')) p->name[t-2] = '\0'; - } - if (n_seqs == n_needed) break; - } - *n = n_seqs; - if (n_seqs && trim_qual >= 1) - fprintf(stderr, "[bwa_read_seq] %.1f%% bases are trimmed.\n", 100.0f * n_trimmed/n_tot); - if (n_seqs == 0) { - free(seqs); - return 0; - } - return seqs; -} - -void bwa_free_read_seq(int n_seqs, bwa_seq_t *seqs) -{ - int i, j; - for (i = 0; i != n_seqs; ++i) { - bwa_seq_t *p = seqs + i; - for (j = 0; j < p->n_multi; ++j) - if (p->multi[j].cigar) free(p->multi[j].cigar); - free(p->name); - free(p->seq); free(p->rseq); free(p->qual); free(p->aln); free(p->md); free(p->multi); - free(p->cigar); - } - free(seqs); -} diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/bwt.c --- a/bwa-0.7.9a/bwt.c Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,437 
+0,0 @@ -/* The MIT License - - Copyright (c) 2008 Genome Research Ltd (GRL). - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* Contact: Heng Li */ - -#include -#include -#include -#include -#include -#include "utils.h" -#include "bwt.h" -#include "kvec.h" - -#ifdef USE_MALLOC_WRAPPERS -# include "malloc_wrap.h" -#endif - -void bwt_gen_cnt_table(bwt_t *bwt) -{ - int i, j; - for (i = 0; i != 256; ++i) { - uint32_t x = 0; - for (j = 0; j != 4; ++j) - x |= (((i&3) == j) + ((i>>2&3) == j) + ((i>>4&3) == j) + (i>>6 == j)) << (j<<3); - bwt->cnt_table[i] = x; - } -} - -static inline bwtint_t bwt_invPsi(const bwt_t *bwt, bwtint_t k) // compute inverse CSA -{ - bwtint_t x = k - (k > bwt->primary); - x = bwt_B0(bwt, x); - x = bwt->L2[x] + bwt_occ(bwt, k, x); - return k == bwt->primary? 
0 : x; -} - -// bwt->bwt and bwt->occ must be precalculated -void bwt_cal_sa(bwt_t *bwt, int intv) -{ - bwtint_t isa, sa, i; // S(isa) = sa - int intv_round = intv; - - kv_roundup32(intv_round); - xassert(intv_round == intv, "SA sample interval is not a power of 2."); - xassert(bwt->bwt, "bwt_t::bwt is not initialized."); - - if (bwt->sa) free(bwt->sa); - bwt->sa_intv = intv; - bwt->n_sa = (bwt->seq_len + intv) / intv; - bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t)); - // calculate SA value - isa = 0; sa = bwt->seq_len; - for (i = 0; i < bwt->seq_len; ++i) { - if (isa % intv == 0) bwt->sa[isa/intv] = sa; - --sa; - isa = bwt_invPsi(bwt, isa); - } - if (isa % intv == 0) bwt->sa[isa/intv] = sa; - bwt->sa[0] = (bwtint_t)-1; // before this line, bwt->sa[0] = bwt->seq_len -} - -bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k) -{ - bwtint_t sa = 0, mask = bwt->sa_intv - 1; - while (k & mask) { - ++sa; - k = bwt_invPsi(bwt, k); - } - /* without setting bwt->sa[0] = -1, the following line should be - changed to (sa + bwt->sa[k/bwt->sa_intv]) % (bwt->seq_len + 1) */ - return sa + bwt->sa[k/bwt->sa_intv]; -} - -static inline int __occ_aux(uint64_t y, int c) -{ - // reduce nucleotide counting to bits counting - y = ((c&2)? y : ~y) >> 1 & ((c&1)? 
y : ~y) & 0x5555555555555555ull; - // count the number of 1s in y - y = (y & 0x3333333333333333ull) + (y >> 2 & 0x3333333333333333ull); - return ((y + (y >> 4)) & 0xf0f0f0f0f0f0f0full) * 0x101010101010101ull >> 56; -} - -bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c) -{ - bwtint_t n; - uint32_t *p, *end; - - if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c]; - if (k == (bwtint_t)(-1)) return 0; - k -= (k >= bwt->primary); // because $ is not in bwt - - // retrieve Occ at k/OCC_INTERVAL - n = ((bwtint_t*)(p = bwt_occ_intv(bwt, k)))[c]; - p += sizeof(bwtint_t); // jump to the start of the first BWT cell - - // calculate Occ up to the last k/32 - end = p + (((k>>5) - ((k&~OCC_INTV_MASK)>>5))<<1); - for (; p < end; p += 2) n += __occ_aux((uint64_t)p[0]<<32 | p[1], c); - - // calculate Occ - n += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~k&31)<<1)) - 1), c); - if (c == 0) n -= ~k&31; // corrected for the masked bits - - return n; -} - -// an analogy to bwt_occ() but more efficient, requiring k <= l -void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol) -{ - bwtint_t _k, _l; - _k = (k >= bwt->primary)? k-1 : k; - _l = (l >= bwt->primary)? 
l-1 : l; - if (_l/OCC_INTERVAL != _k/OCC_INTERVAL || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) { - *ok = bwt_occ(bwt, k, c); - *ol = bwt_occ(bwt, l, c); - } else { - bwtint_t m, n, i, j; - uint32_t *p; - if (k >= bwt->primary) --k; - if (l >= bwt->primary) --l; - n = ((bwtint_t*)(p = bwt_occ_intv(bwt, k)))[c]; - p += sizeof(bwtint_t); - // calculate *ok - j = k >> 5 << 5; - for (i = k/OCC_INTERVAL*OCC_INTERVAL; i < j; i += 32, p += 2) - n += __occ_aux((uint64_t)p[0]<<32 | p[1], c); - m = n; - n += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~k&31)<<1)) - 1), c); - if (c == 0) n -= ~k&31; // corrected for the masked bits - *ok = n; - // calculate *ol - j = l >> 5 << 5; - for (; i < j; i += 32, p += 2) - m += __occ_aux((uint64_t)p[0]<<32 | p[1], c); - m += __occ_aux(((uint64_t)p[0]<<32 | p[1]) & ~((1ull<<((~l&31)<<1)) - 1), c); - if (c == 0) m -= ~l&31; // corrected for the masked bits - *ol = m; - } -} - -#define __occ_aux4(bwt, b) \ - ((bwt)->cnt_table[(b)&0xff] + (bwt)->cnt_table[(b)>>8&0xff] \ - + (bwt)->cnt_table[(b)>>16&0xff] + (bwt)->cnt_table[(b)>>24]) - -void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]) -{ - bwtint_t x; - uint32_t *p, tmp, *end; - if (k == (bwtint_t)(-1)) { - memset(cnt, 0, 4 * sizeof(bwtint_t)); - return; - } - k -= (k >= bwt->primary); // because $ is not in bwt - p = bwt_occ_intv(bwt, k); - memcpy(cnt, p, 4 * sizeof(bwtint_t)); - p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t)) - end = p + ((k>>4) - ((k&~OCC_INTV_MASK)>>4)); // this is the end point of the following loop - for (x = 0; p < end; ++p) x += __occ_aux4(bwt, *p); - tmp = *p & ~((1U<<((~k&15)<<1)) - 1); - x += __occ_aux4(bwt, tmp) - (~k&15); - cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24; -} - -// an analogy to bwt_occ4() but more efficient, requiring k <= l -void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]) -{ - bwtint_t _k, _l; - _k = k - (k >= 
bwt->primary); - _l = l - (l >= bwt->primary); - if (_l>>OCC_INTV_SHIFT != _k>>OCC_INTV_SHIFT || k == (bwtint_t)(-1) || l == (bwtint_t)(-1)) { - bwt_occ4(bwt, k, cntk); - bwt_occ4(bwt, l, cntl); - } else { - bwtint_t x, y; - uint32_t *p, tmp, *endk, *endl; - k -= (k >= bwt->primary); // because $ is not in bwt - l -= (l >= bwt->primary); - p = bwt_occ_intv(bwt, k); - memcpy(cntk, p, 4 * sizeof(bwtint_t)); - p += sizeof(bwtint_t); // sizeof(bwtint_t) = 4*(sizeof(bwtint_t)/sizeof(uint32_t)) - // prepare cntk[] - endk = p + ((k>>4) - ((k&~OCC_INTV_MASK)>>4)); - endl = p + ((l>>4) - ((l&~OCC_INTV_MASK)>>4)); - for (x = 0; p < endk; ++p) x += __occ_aux4(bwt, *p); - y = x; - tmp = *p & ~((1U<<((~k&15)<<1)) - 1); - x += __occ_aux4(bwt, tmp) - (~k&15); - // calculate cntl[] and finalize cntk[] - for (; p < endl; ++p) y += __occ_aux4(bwt, *p); - tmp = *p & ~((1U<<((~l&15)<<1)) - 1); - y += __occ_aux4(bwt, tmp) - (~l&15); - memcpy(cntl, cntk, 4 * sizeof(bwtint_t)); - cntk[0] += x&0xff; cntk[1] += x>>8&0xff; cntk[2] += x>>16&0xff; cntk[3] += x>>24; - cntl[0] += y&0xff; cntl[1] += y>>8&0xff; cntl[2] += y>>16&0xff; cntl[3] += y>>24; - } -} - -int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end) -{ - bwtint_t k, l, ok, ol; - int i; - k = 0; l = bwt->seq_len; - for (i = len - 1; i >= 0; --i) { - ubyte_t c = str[i]; - if (c > 3) return 0; // no match - bwt_2occ(bwt, k - 1, l, c, &ok, &ol); - k = bwt->L2[c] + ok + 1; - l = bwt->L2[c] + ol; - if (k > l) break; // no match - } - if (k > l) return 0; // no match - if (sa_begin) *sa_begin = k; - if (sa_end) *sa_end = l; - return l - k + 1; -} - -int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0) -{ - int i; - bwtint_t k, l, ok, ol; - k = *k0; l = *l0; - for (i = len - 1; i >= 0; --i) { - ubyte_t c = str[i]; - if (c > 3) return 0; // there is an N here. 
no match - bwt_2occ(bwt, k - 1, l, c, &ok, &ol); - k = bwt->L2[c] + ok + 1; - l = bwt->L2[c] + ol; - if (k > l) return 0; // no match - } - *k0 = k; *l0 = l; - return l - k + 1; -} - -/********************* - * Bidirectional BWT * - *********************/ - -void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back) -{ - bwtint_t tk[4], tl[4]; - int i; - bwt_2occ4(bwt, ik->x[!is_back] - 1, ik->x[!is_back] - 1 + ik->x[2], tk, tl); - for (i = 0; i != 4; ++i) { - ok[i].x[!is_back] = bwt->L2[i] + 1 + tk[i]; - ok[i].x[2] = tl[i] - tk[i]; - } - ok[3].x[is_back] = ik->x[is_back] + (ik->x[!is_back] <= bwt->primary && ik->x[!is_back] + ik->x[2] - 1 >= bwt->primary); - ok[2].x[is_back] = ok[3].x[is_back] + ok[3].x[2]; - ok[1].x[is_back] = ok[2].x[is_back] + ok[2].x[2]; - ok[0].x[is_back] = ok[1].x[is_back] + ok[1].x[2]; -} - -static void bwt_reverse_intvs(bwtintv_v *p) -{ - if (p->n > 1) { - int j; - for (j = 0; j < p->n>>1; ++j) { - bwtintv_t tmp = p->a[p->n - 1 - j]; - p->a[p->n - 1 - j] = p->a[j]; - p->a[j] = tmp; - } - } -} - -int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]) -{ - int i, j, c, ret; - bwtintv_t ik, ok[4]; - bwtintv_v a[2], *prev, *curr, *swap; - - mem->n = 0; - if (q[x] > 3) return x + 1; - if (min_intv < 1) min_intv = 1; // the interval size should be at least 1 - kv_init(a[0]); kv_init(a[1]); - prev = tmpvec && tmpvec[0]? tmpvec[0] : &a[0]; // use the temporary vector if provided - curr = tmpvec && tmpvec[1]? 
tmpvec[1] : &a[1]; - bwt_set_intv(bwt, q[x], ik); // the initial interval of a single base - ik.info = x + 1; - - for (i = x + 1, curr->n = 0; i < len; ++i) { // forward search - if (q[i] < 4) { // an A/C/G/T base - c = 3 - q[i]; // complement of q[i] - bwt_extend(bwt, &ik, ok, 0); - if (ok[c].x[2] != ik.x[2]) { // change of the interval size - kv_push(bwtintv_t, *curr, ik); - if (ok[c].x[2] < min_intv) break; // the interval size is too small to be extended further - } - ik = ok[c]; ik.info = i + 1; - } else { // an ambiguous base - kv_push(bwtintv_t, *curr, ik); - break; // always terminate extension at an ambiguous base; in this case, ia[0].info; // this will be the returned value - swap = curr; curr = prev; prev = swap; - - for (i = x - 1; i >= -1; --i) { // backward search for MEMs - c = i < 0? -1 : q[i] < 4? q[i] : -1; // c==-1 if i<0 or q[i] is an ambiguous base - for (j = 0, curr->n = 0; j < prev->n; ++j) { - bwtintv_t *p = &prev->a[j]; - bwt_extend(bwt, p, ok, 1); - if (c < 0 || ok[c].x[2] < min_intv) { // keep the hit if reaching the beginning or an ambiguous base or the intv is small enough - if (curr->n == 0) { // test curr->n>0 to make sure there are no longer matches - if (mem->n == 0 || i + 1 < mem->a[mem->n-1].info>>32) { // skip contained matches - ik = *p; ik.info |= (uint64_t)(i + 1)<<32; - kv_push(bwtintv_t, *mem, ik); - } - } // otherwise the match is contained in another longer match - } else if (curr->n == 0 || ok[c].x[2] != curr->a[curr->n-1].x[2]) { - ok[c].info = p->info; - kv_push(bwtintv_t, *curr, ok[c]); - } - } - if (curr->n == 0) break; - swap = curr; curr = prev; prev = swap; - } - bwt_reverse_intvs(mem); // s.t. 
sorted by the start coordinate - - if (tmpvec == 0 || tmpvec[0] == 0) free(a[0].a); - if (tmpvec == 0 || tmpvec[1] == 0) free(a[1].a); - return ret; -} - -/************************* - * Read/write BWT and SA * - *************************/ - -void bwt_dump_bwt(const char *fn, const bwt_t *bwt) -{ - FILE *fp; - fp = xopen(fn, "wb"); - err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp); - err_fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp); - err_fwrite(bwt->bwt, 4, bwt->bwt_size, fp); - err_fflush(fp); - err_fclose(fp); -} - -void bwt_dump_sa(const char *fn, const bwt_t *bwt) -{ - FILE *fp; - fp = xopen(fn, "wb"); - err_fwrite(&bwt->primary, sizeof(bwtint_t), 1, fp); - err_fwrite(bwt->L2+1, sizeof(bwtint_t), 4, fp); - err_fwrite(&bwt->sa_intv, sizeof(bwtint_t), 1, fp); - err_fwrite(&bwt->seq_len, sizeof(bwtint_t), 1, fp); - err_fwrite(bwt->sa + 1, sizeof(bwtint_t), bwt->n_sa - 1, fp); - err_fflush(fp); - err_fclose(fp); -} - -static bwtint_t fread_fix(FILE *fp, bwtint_t size, void *a) -{ // Mac/Darwin has a bug when reading data longer than 2GB. This function fixes this issue by reading data in small chunks - const int bufsize = 0x1000000; // 16M block - bwtint_t offset = 0; - while (size) { - int x = bufsize < size? 
bufsize : size; - if ((x = err_fread_noeof(a + offset, 1, x, fp)) == 0) break; - size -= x; offset += x; - } - return offset; -} - -void bwt_restore_sa(const char *fn, bwt_t *bwt) -{ - char skipped[256]; - FILE *fp; - bwtint_t primary; - - fp = xopen(fn, "rb"); - err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp); - xassert(primary == bwt->primary, "SA-BWT inconsistency: primary is not the same."); - err_fread_noeof(skipped, sizeof(bwtint_t), 4, fp); // skip - err_fread_noeof(&bwt->sa_intv, sizeof(bwtint_t), 1, fp); - err_fread_noeof(&primary, sizeof(bwtint_t), 1, fp); - xassert(primary == bwt->seq_len, "SA-BWT inconsistency: seq_len is not the same."); - - bwt->n_sa = (bwt->seq_len + bwt->sa_intv) / bwt->sa_intv; - bwt->sa = (bwtint_t*)calloc(bwt->n_sa, sizeof(bwtint_t)); - bwt->sa[0] = -1; - - fread_fix(fp, sizeof(bwtint_t) * (bwt->n_sa - 1), bwt->sa + 1); - err_fclose(fp); -} - -bwt_t *bwt_restore_bwt(const char *fn) -{ - bwt_t *bwt; - FILE *fp; - - bwt = (bwt_t*)calloc(1, sizeof(bwt_t)); - fp = xopen(fn, "rb"); - err_fseek(fp, 0, SEEK_END); - bwt->bwt_size = (err_ftell(fp) - sizeof(bwtint_t) * 5) >> 2; - bwt->bwt = (uint32_t*)calloc(bwt->bwt_size, 4); - err_fseek(fp, 0, SEEK_SET); - err_fread_noeof(&bwt->primary, sizeof(bwtint_t), 1, fp); - err_fread_noeof(bwt->L2+1, sizeof(bwtint_t), 4, fp); - fread_fix(fp, bwt->bwt_size<<2, bwt->bwt); - bwt->seq_len = bwt->L2[4]; - err_fclose(fp); - bwt_gen_cnt_table(bwt); - - return bwt; -} - -void bwt_destroy(bwt_t *bwt) -{ - if (bwt == 0) return; - free(bwt->sa); free(bwt->bwt); - free(bwt); -} diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/bwt.h --- a/bwa-0.7.9a/bwt.h Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,128 +0,0 @@ -/* The MIT License - - Copyright (c) 2008 Genome Research Ltd (GRL). 
- - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* Contact: Heng Li */ - -#ifndef BWA_BWT_H -#define BWA_BWT_H - -#include -#include - -// requirement: (OCC_INTERVAL%16 == 0); please DO NOT change this line because some part of the code assume OCC_INTERVAL=0x80 -#define OCC_INTV_SHIFT 7 -#define OCC_INTERVAL (1LL<bwt[(k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4) + sizeof(bwtint_t)/4*4 + (k)%OCC_INTERVAL/16]) -#define bwt_occ_intv(b, k) ((b)->bwt + (k)/OCC_INTERVAL * (OCC_INTERVAL/(sizeof(uint32_t)*8/2) + sizeof(bwtint_t)/4*4) -*/ - -// The following two lines are ONLY correct when OCC_INTERVAL==0x80 -#define bwt_bwt(b, k) ((b)->bwt[((k)>>7<<4) + sizeof(bwtint_t) + (((k)&0x7f)>>4)]) -#define bwt_occ_intv(b, k) ((b)->bwt + ((k)>>7<<4)) - -/* retrieve a character from the $-removed BWT string. 
Note that - * bwt_t::bwt is not exactly the BWT string and therefore this macro is - * called bwt_B0 instead of bwt_B */ -#define bwt_B0(b, k) (bwt_bwt(b, k)>>((~(k)&0xf)<<1)&3) - -#define bwt_set_intv(bwt, c, ik) ((ik).x[0] = (bwt)->L2[(int)(c)]+1, (ik).x[2] = (bwt)->L2[(int)(c)+1]-(bwt)->L2[(int)(c)], (ik).x[1] = (bwt)->L2[3-(c)]+1, (ik).info = 0) - -#ifdef __cplusplus -extern "C" { -#endif - - void bwt_dump_bwt(const char *fn, const bwt_t *bwt); - void bwt_dump_sa(const char *fn, const bwt_t *bwt); - - bwt_t *bwt_restore_bwt(const char *fn); - void bwt_restore_sa(const char *fn, bwt_t *bwt); - - void bwt_destroy(bwt_t *bwt); - - void bwt_bwtgen(const char *fn_pac, const char *fn_bwt); // from BWT-SW - void bwt_cal_sa(bwt_t *bwt, int intv); - - void bwt_bwtupdate_core(bwt_t *bwt); - - bwtint_t bwt_occ(const bwt_t *bwt, bwtint_t k, ubyte_t c); - void bwt_occ4(const bwt_t *bwt, bwtint_t k, bwtint_t cnt[4]); - bwtint_t bwt_sa(const bwt_t *bwt, bwtint_t k); - - // more efficient version of bwt_occ/bwt_occ4 for retrieving two close Occ values - void bwt_gen_cnt_table(bwt_t *bwt); - void bwt_2occ(const bwt_t *bwt, bwtint_t k, bwtint_t l, ubyte_t c, bwtint_t *ok, bwtint_t *ol); - void bwt_2occ4(const bwt_t *bwt, bwtint_t k, bwtint_t l, bwtint_t cntk[4], bwtint_t cntl[4]); - - int bwt_match_exact(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *sa_begin, bwtint_t *sa_end); - int bwt_match_exact_alt(const bwt_t *bwt, int len, const ubyte_t *str, bwtint_t *k0, bwtint_t *l0); - - /** - * Extend bi-SA-interval _ik_ - */ - void bwt_extend(const bwt_t *bwt, const bwtintv_t *ik, bwtintv_t ok[4], int is_back); - - /** - * Given a query _q_, collect potential SMEMs covering position _x_ and store them in _mem_. - * Return the end of the longest exact match starting from _x_. 
- */ - int bwt_smem1(const bwt_t *bwt, int len, const uint8_t *q, int x, int min_intv, bwtintv_v *mem, bwtintv_v *tmpvec[2]); - - // SMEM iterator interface - -#ifdef __cplusplus -} -#endif - -#endif diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/bwt_gen.c --- a/bwa-0.7.9a/bwt_gen.c Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,1627 +0,0 @@ -/* - - BWTConstruct.c BWT-Index Construction - - This module constructs BWT and auxiliary data structures. - - Copyright (C) 2004, Wong Chi Kwong. - - This program is free software; you can redistribute it and/or - modify it under the terms of the GNU General Public License - as published by the Free Software Foundation; either version 2 - of the License, or (at your option) any later version. - - This program is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with this program; if not, write to the Free Software - Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
- -*/ - -#include -#include -#include -#include -#include -#include -#include "QSufSort.h" - -#ifdef USE_MALLOC_WRAPPERS -# include "malloc_wrap.h" -#endif - -typedef uint64_t bgint_t; -typedef int64_t sbgint_t; - -#define ALPHABET_SIZE 4 -#define BIT_PER_CHAR 2 -#define CHAR_PER_WORD 16 -#define CHAR_PER_BYTE 4 - -#define BITS_IN_WORD 32 -#define BITS_IN_BYTE 8 -#define BYTES_IN_WORD 4 - -#define ALL_ONE_MASK 0xFFFFFFFF -#define DNA_OCC_CNT_TABLE_SIZE_IN_WORD 65536 - -#define BITS_PER_OCC_VALUE 16 -#define OCC_VALUE_PER_WORD 2 -#define OCC_INTERVAL 256 -#define OCC_INTERVAL_MAJOR 65536 - -#define TRUE 1 -#define FALSE 0 - -#define BWTINC_INSERT_SORT_NUM_ITEM 7 - -#define MIN_AVAILABLE_WORD 0x10000 - -#define average(value1, value2) ( ((value1) & (value2)) + ((value1) ^ (value2)) / 2 ) -#define min(value1, value2) ( ((value1) < (value2)) ? (value1) : (value2) ) -#define max(value1, value2) ( ((value1) > (value2)) ? (value1) : (value2) ) -#define med3(a, b, c) ( ac ? b : a>c ? c : a)) -#define swap(a, b, t); t = a; a = b; b = t; -#define truncateLeft(value, offset) ( (value) << (offset) >> (offset) ) -#define truncateRight(value, offset) ( (value) >> (offset) << (offset) ) -#define DNA_OCC_SUM_EXCEPTION(sum) ((sum & 0xfefefeff) == 0) - -typedef struct BWT { - bgint_t textLength; // length of the text - bgint_t inverseSa0; // SA-1[0] - bgint_t *cumulativeFreq; // cumulative frequency - unsigned int *bwtCode; // BWT code - unsigned int *occValue; // Occurrence values stored explicitly - bgint_t *occValueMajor; // Occurrence values stored explicitly - unsigned int *decodeTable; // For decoding BWT by table lookup - bgint_t bwtSizeInWord; // Temporary variable to hold the memory allocated - bgint_t occSizeInWord; // Temporary variable to hold the memory allocated - bgint_t occMajorSizeInWord; // Temporary variable to hold the memory allocated -} BWT; - -typedef struct BWTInc { - BWT *bwt; - unsigned int numberOfIterationDone; - bgint_t *cumulativeCountInCurrentBuild; - 
bgint_t availableWord; - bgint_t buildSize; - bgint_t initialMaxBuildSize; - bgint_t incMaxBuildSize; - unsigned int firstCharInLastIteration; - unsigned int *workingMemory; - unsigned int *packedText; - unsigned char *textBuffer; - unsigned int *packedShift; -} BWTInc; - -static bgint_t TextLengthFromBytePacked(bgint_t bytePackedLength, unsigned int bitPerChar, - unsigned int lastByteLength) -{ - return (bytePackedLength - 1) * (BITS_IN_BYTE / bitPerChar) + lastByteLength; -} - -static void initializeVAL(unsigned int *startAddr, const bgint_t length, const unsigned int initValue) -{ - bgint_t i; - for (i=0; i>= 2; - } - } - -} -// for BWTIncCreate() -static bgint_t BWTOccValueMajorSizeInWord(const bgint_t numChar) -{ - bgint_t numOfOccValue; - unsigned numOfOccIntervalPerMajor; - numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding - numOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL; - return (numOfOccValue + numOfOccIntervalPerMajor - 1) / numOfOccIntervalPerMajor * ALPHABET_SIZE; -} -// for BWTIncCreate() -static bgint_t BWTOccValueMinorSizeInWord(const bgint_t numChar) -{ - bgint_t numOfOccValue; - numOfOccValue = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding - return (numOfOccValue + OCC_VALUE_PER_WORD - 1) / OCC_VALUE_PER_WORD * ALPHABET_SIZE; -} -// for BWTIncCreate() -static bgint_t BWTResidentSizeInWord(const bgint_t numChar) { - - bgint_t numCharRoundUpToOccInterval; - - // The $ in BWT at the position of inverseSa0 is not encoded - numCharRoundUpToOccInterval = (numChar + OCC_INTERVAL - 1) / OCC_INTERVAL * OCC_INTERVAL; - - return (numCharRoundUpToOccInterval + CHAR_PER_WORD - 1) / CHAR_PER_WORD; - -} - -static void BWTIncSetBuildSizeAndTextAddr(BWTInc *bwtInc) -{ - bgint_t maxBuildSize; - - if (bwtInc->bwt->textLength == 0) { - // initial build - // Minus 2 because n+1 entries of seq and rank needed for n char - maxBuildSize = 
(bwtInc->availableWord - (2 + OCC_INTERVAL / CHAR_PER_WORD) * (sizeof(bgint_t) / 4)) - / (2 * CHAR_PER_WORD + 1) * CHAR_PER_WORD / (sizeof(bgint_t) / 4); - if (bwtInc->initialMaxBuildSize > 0) { - bwtInc->buildSize = min(bwtInc->initialMaxBuildSize, maxBuildSize); - } else { - bwtInc->buildSize = maxBuildSize; - } - } else { - // Minus 3 because n+1 entries of sorted rank, seq and rank needed for n char - // Minus numberOfIterationDone because bwt slightly shift to left in each iteration - maxBuildSize = (bwtInc->availableWord - bwtInc->bwt->bwtSizeInWord - bwtInc->bwt->occSizeInWord - - (3 + bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR) * (sizeof(bgint_t) / 4)) - / 3 / (sizeof(bgint_t) / 4); - if (maxBuildSize < CHAR_PER_WORD) { - fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n"); - exit(1); - } - if (bwtInc->incMaxBuildSize > 0) { - bwtInc->buildSize = min(bwtInc->incMaxBuildSize, maxBuildSize); - } else { - bwtInc->buildSize = maxBuildSize; - } - if (bwtInc->buildSize < CHAR_PER_WORD) - bwtInc->buildSize = CHAR_PER_WORD; - } - - if (bwtInc->buildSize < CHAR_PER_WORD) { - fprintf(stderr, "BWTIncSetBuildSizeAndTextAddr(): Not enough space allocated to continue construction!\n"); - exit(1); - } - - bwtInc->buildSize = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD; - - bwtInc->packedText = bwtInc->workingMemory + 2 * (bwtInc->buildSize + 1) * (sizeof(bgint_t) / 4); - bwtInc->textBuffer = (unsigned char*)(bwtInc->workingMemory + (bwtInc->buildSize + 1) * (sizeof(bgint_t) / 4)); -} - -// for ceilLog2() -unsigned int leadingZero(const unsigned int input) -{ - unsigned int l; - const static unsigned int leadingZero8bit[256] = {8,7,6,6,5,5,5,5,4,4,4,4,4,4,4,4,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, - 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, - 
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; - - if (input & 0xFFFF0000) { - if (input & 0xFF000000) { - l = leadingZero8bit[input >> 24]; - } else { - l = 8 + leadingZero8bit[input >> 16]; - } - } else { - if (input & 0x0000FF00) { - l = 16 + leadingZero8bit[input >> 8]; - } else { - l = 24 + leadingZero8bit[input]; - } - } - return l; - -} -// for BitPerBytePackedChar() -static unsigned int ceilLog2(const unsigned int input) -{ - if (input <= 1) return 0; - return BITS_IN_WORD - leadingZero(input - 1); - -} -// for ConvertBytePackedToWordPacked() -static unsigned int BitPerBytePackedChar(const unsigned int alphabetSize) -{ - unsigned int bitPerChar; - bitPerChar = ceilLog2(alphabetSize); - // Return the largest number of bit that does not affect packing efficiency - if (BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar) > bitPerChar) - bitPerChar = BITS_IN_BYTE / (BITS_IN_BYTE / bitPerChar); - return bitPerChar; -} -// for ConvertBytePackedToWordPacked() -static unsigned int BitPerWordPackedChar(const unsigned int alphabetSize) -{ - return ceilLog2(alphabetSize); -} - -static void ConvertBytePackedToWordPacked(const unsigned char *input, unsigned int *output, const unsigned int alphabetSize, - const bgint_t textLength) -{ - bgint_t i; - unsigned int j, k, c; - unsigned int bitPerBytePackedChar; - unsigned int bitPerWordPackedChar; - unsigned int charPerWord; - unsigned int charPerByte; - unsigned int bytePerIteration; - bgint_t byteProcessed = 0; - bgint_t wordProcessed = 0; - unsigned int mask, shift; - - unsigned int buffer[BITS_IN_WORD]; - - bitPerBytePackedChar = BitPerBytePackedChar(alphabetSize); - bitPerWordPackedChar = BitPerWordPackedChar(alphabetSize); - charPerByte = BITS_IN_BYTE / bitPerBytePackedChar; - charPerWord = BITS_IN_WORD / 
bitPerWordPackedChar; - - bytePerIteration = charPerWord / charPerByte; - mask = truncateRight(ALL_ONE_MASK, BITS_IN_WORD - bitPerWordPackedChar); - shift = BITS_IN_WORD - BITS_IN_BYTE + bitPerBytePackedChar - bitPerWordPackedChar; - - while ((wordProcessed + 1) * charPerWord < textLength) { - - k = 0; - for (i=0; i> bitPerWordPackedChar * i; - } - output[wordProcessed] = c; - wordProcessed++; - - } - - k = 0; - for (i=0; i < (textLength - wordProcessed * charPerWord - 1) / charPerByte + 1; i++) { - c = (unsigned int)input[byteProcessed] << shift; - for (j=0; j> bitPerWordPackedChar * i; - } - output[wordProcessed] = c; -} - -BWT *BWTCreate(const bgint_t textLength, unsigned int *decodeTable) -{ - BWT *bwt; - - bwt = (BWT*)calloc(1, sizeof(BWT)); - - bwt->textLength = 0; - - bwt->cumulativeFreq = (bgint_t*)calloc((ALPHABET_SIZE + 1), sizeof(bgint_t)); - initializeVAL_bg(bwt->cumulativeFreq, ALPHABET_SIZE + 1, 0); - - bwt->bwtSizeInWord = 0; - - // Generate decode tables - if (decodeTable == NULL) { - bwt->decodeTable = (unsigned*)calloc(DNA_OCC_CNT_TABLE_SIZE_IN_WORD, sizeof(unsigned int)); - GenerateDNAOccCountTable(bwt->decodeTable); - } else { - bwt->decodeTable = decodeTable; - } - - bwt->occMajorSizeInWord = BWTOccValueMajorSizeInWord(textLength); - bwt->occValueMajor = (bgint_t*)calloc(bwt->occMajorSizeInWord, sizeof(bgint_t)); - - bwt->occSizeInWord = 0; - bwt->occValue = NULL; - - return bwt; -} - -BWTInc *BWTIncCreate(const bgint_t textLength, unsigned int initialMaxBuildSize, unsigned int incMaxBuildSize) -{ - BWTInc *bwtInc; - unsigned int i, n_iter; - - if (textLength < incMaxBuildSize) incMaxBuildSize = textLength; - if (textLength < initialMaxBuildSize) initialMaxBuildSize = textLength; - - bwtInc = (BWTInc*)calloc(1, sizeof(BWTInc)); - bwtInc->numberOfIterationDone = 0; - bwtInc->bwt = BWTCreate(textLength, NULL); - bwtInc->initialMaxBuildSize = initialMaxBuildSize; - bwtInc->incMaxBuildSize = incMaxBuildSize; - bwtInc->cumulativeCountInCurrentBuild 
= (bgint_t*)calloc((ALPHABET_SIZE + 1), sizeof(bgint_t)); - initializeVAL_bg(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0); - - // Build frequently accessed data - bwtInc->packedShift = (unsigned*)calloc(CHAR_PER_WORD, sizeof(unsigned int)); - for (i=0; ipackedShift[i] = BITS_IN_WORD - (i+1) * BIT_PER_CHAR; - - n_iter = (textLength - initialMaxBuildSize) / incMaxBuildSize + 1; - bwtInc->availableWord = BWTResidentSizeInWord(textLength) + BWTOccValueMinorSizeInWord(textLength) // minimal memory requirement - + OCC_INTERVAL / BIT_PER_CHAR * n_iter * 2 * (sizeof(bgint_t) / 4) // buffer at the end of occ array - + incMaxBuildSize/5 * 3 * (sizeof(bgint_t) / 4); // space for the 3 temporary arrays in each iteration - if (bwtInc->availableWord < MIN_AVAILABLE_WORD) bwtInc->availableWord = MIN_AVAILABLE_WORD; // lh3: otherwise segfaul when availableWord is too small - fprintf(stderr, "[%s] textLength=%ld, availableWord=%ld\n", __func__, (long)textLength, (long)bwtInc->availableWord); - bwtInc->workingMemory = (unsigned*)calloc(bwtInc->availableWord, BYTES_IN_WORD); - - return bwtInc; -} -// for BWTIncConstruct() -static void BWTIncPutPackedTextToRank(const unsigned int *packedText, bgint_t* __restrict rank, - bgint_t* __restrict cumulativeCount, const bgint_t numChar) -{ - bgint_t i; - unsigned int j; - unsigned int c, t; - unsigned int packedMask; - bgint_t rankIndex; - bgint_t lastWord; - unsigned int numCharInLastWord; - - lastWord = (numChar - 1) / CHAR_PER_WORD; - numCharInLastWord = numChar - lastWord * CHAR_PER_WORD; - - packedMask = ALL_ONE_MASK >> (BITS_IN_WORD - BIT_PER_CHAR); - rankIndex = numChar - 1; - - t = packedText[lastWord] >> (BITS_IN_WORD - numCharInLastWord * BIT_PER_CHAR); - for (i=0; i>= BIT_PER_CHAR; - } - - for (i=lastWord; i--;) { // loop from lastWord - 1 to 0 - t = packedText[i]; - for (j=0; j>= BIT_PER_CHAR; - } - } - - // Convert occurrence to cumulativeCount - cumulativeCount[2] += cumulativeCount[1]; - cumulativeCount[3] += 
cumulativeCount[2]; - cumulativeCount[4] += cumulativeCount[3]; -} - - -static void ForwardDNAAllOccCountNoLimit(const unsigned int* dna, const bgint_t index, - bgint_t* __restrict occCount, const unsigned int* dnaDecodeTable) -{ - static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000, - 0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000, - 0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00, - 0xFFFFFF00, 0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC }; - - bgint_t iteration, i; - unsigned int wordToCount, charToCount; - unsigned int j, c, sum; - - occCount[0] = 0; - occCount[1] = 0; - occCount[2] = 0; - occCount[3] = 0; - - iteration = index / 256; - wordToCount = (index - iteration * 256) / 16; - charToCount = index - iteration * 256 - wordToCount * 16; - - for (i=0; i> 16]; - sum += dnaDecodeTable[*dna & 0x0000FFFF]; - dna++; - } - if (!DNA_OCC_SUM_EXCEPTION(sum)) { - occCount[0] += sum & 0x000000FF; sum >>= 8; - occCount[1] += sum & 0x000000FF; sum >>= 8; - occCount[2] += sum & 0x000000FF; sum >>= 8; - occCount[3] += sum; - } else { - // only some or all of the 3 bits are on - // in reality, only one of the four cases are possible - if (sum == 0x00000100) { - occCount[0] += 256; - } else if (sum == 0x00010000) { - occCount[1] += 256; - } else if (sum == 0x01000000) { - occCount[2] += 256; - } else if (sum == 0x00000000) { - occCount[3] += 256; - } else { - fprintf(stderr, "ForwardDNAAllOccCountNoLimit(): DNA occ sum exception!\n"); - exit(1); - } - } - - } - - sum = 0; - for (j=0; j> 16]; - sum += dnaDecodeTable[*dna & 0x0000FFFF]; - dna++; - } - - if (charToCount > 0) { - c = *dna & truncateRightMask[charToCount]; // increase count of 'a' by 16 - c; - sum += dnaDecodeTable[c >> 16]; - sum += dnaDecodeTable[c & 0xFFFF]; - sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess - } - - occCount[0] += sum & 0x000000FF; sum >>= 8; - occCount[1] += sum & 0x000000FF; sum >>= 8; - occCount[2] += sum & 0x000000FF; sum >>= 
8; - occCount[3] += sum; -} - -static void BWTIncBuildPackedBwt(const bgint_t *relativeRank, unsigned int* __restrict bwt, const bgint_t numChar, - const bgint_t *cumulativeCount, const unsigned int *packedShift) { - - bgint_t i, r; - unsigned int c; - bgint_t previousRank, currentRank; - bgint_t wordIndex, charIndex; - bgint_t inverseSa0; - - inverseSa0 = previousRank = relativeRank[0]; - - for (i=1; i<=numChar; i++) { - currentRank = relativeRank[i]; - // previousRank > cumulativeCount[c] because $ is one of the char - c = (previousRank > cumulativeCount[1]) + (previousRank > cumulativeCount[2]) - + (previousRank > cumulativeCount[3]); - // set bwt for currentRank - if (c > 0) { - // c <> 'a' - r = currentRank; - if (r > inverseSa0) { - // - 1 because $ at inverseSa0 is not encoded - r--; - } - wordIndex = r / CHAR_PER_WORD; - charIndex = r - wordIndex * CHAR_PER_WORD; - bwt[wordIndex] |= c << packedShift[charIndex]; - } - previousRank = currentRank; - } -} - -static inline bgint_t BWTOccValueExplicit(const BWT *bwt, const bgint_t occIndexExplicit, - const unsigned int character) -{ - bgint_t occIndexMajor; - - occIndexMajor = occIndexExplicit * OCC_INTERVAL / OCC_INTERVAL_MAJOR; - - if (occIndexExplicit % OCC_VALUE_PER_WORD == 0) { - return bwt->occValueMajor[occIndexMajor * ALPHABET_SIZE + character] + - (bwt->occValue[occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE + character] >> 16); - - } else { - return bwt->occValueMajor[occIndexMajor * ALPHABET_SIZE + character] + - (bwt->occValue[occIndexExplicit / OCC_VALUE_PER_WORD * ALPHABET_SIZE + character] & 0x0000FFFF); - } -} - - -static unsigned int ForwardDNAOccCount(const unsigned int* dna, const unsigned int index, const unsigned int character, - const unsigned int* dnaDecodeTable) -{ - static const unsigned int truncateRightMask[16] = { 0x00000000, 0xC0000000, 0xF0000000, 0xFC000000, - 0xFF000000, 0xFFC00000, 0xFFF00000, 0xFFFC0000, - 0xFFFF0000, 0xFFFFC000, 0xFFFFF000, 0xFFFFFC00, - 0xFFFFFF00, 
0xFFFFFFC0, 0xFFFFFFF0, 0xFFFFFFFC }; - - unsigned int wordToCount, charToCount; - unsigned int i, c; - unsigned int sum = 0; - - wordToCount = index / 16; - charToCount = index - wordToCount * 16; - - for (i=0; i> 16]; - sum += dnaDecodeTable[dna[i] & 0x0000FFFF]; - } - - if (charToCount > 0) { - c = dna[i] & truncateRightMask[charToCount]; // increase count of 'a' by 16 - c; - sum += dnaDecodeTable[c >> 16]; - sum += dnaDecodeTable[c & 0xFFFF]; - sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess - } - - return (sum >> (character * 8)) & 0x000000FF; - -} - -static unsigned int BackwardDNAOccCount(const unsigned int* dna, const unsigned int index, const unsigned int character, - const unsigned int* dnaDecodeTable) -{ - static const unsigned int truncateLeftMask[16] = { 0x00000000, 0x00000003, 0x0000000F, 0x0000003F, - 0x000000FF, 0x000003FF, 0x00000FFF, 0x00003FFF, - 0x0000FFFF, 0x0003FFFF, 0x000FFFFF, 0x003FFFFF, - 0x00FFFFFF, 0x03FFFFFF, 0x0FFFFFFF, 0x3FFFFFFF }; - - unsigned int wordToCount, charToCount; - unsigned int i, c; - unsigned int sum = 0; - - wordToCount = index / 16; - charToCount = index - wordToCount * 16; - - dna -= wordToCount + 1; - - if (charToCount > 0) { - c = *dna & truncateLeftMask[charToCount]; // increase count of 'a' by 16 - c; - sum += dnaDecodeTable[c >> 16]; - sum += dnaDecodeTable[c & 0xFFFF]; - sum += charToCount - 16; // decrease count of 'a' by 16 - positionToProcess - } - - for (i=0; i> 16]; - sum += dnaDecodeTable[*dna & 0x0000FFFF]; - } - - return (sum >> (character * 8)) & 0x000000FF; - -} - -bgint_t BWTOccValue(const BWT *bwt, bgint_t index, const unsigned int character) -{ - bgint_t occValue; - bgint_t occExplicitIndex, occIndex; - - // $ is supposed to be positioned at inverseSa0 but it is not encoded - // therefore index is subtracted by 1 for adjustment - if (index > bwt->inverseSa0) - index--; - - occExplicitIndex = (index + OCC_INTERVAL / 2 - 1) / OCC_INTERVAL; // Bidirectional encoding - 
occIndex = occExplicitIndex * OCC_INTERVAL; - occValue = BWTOccValueExplicit(bwt, occExplicitIndex, character); - - if (occIndex == index) - return occValue; - - if (occIndex < index) { - return occValue + ForwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, index - occIndex, character, bwt->decodeTable); - } else { - return occValue - BackwardDNAOccCount(bwt->bwtCode + occIndex / CHAR_PER_WORD, occIndex - index, character, bwt->decodeTable); - } -} - -static bgint_t BWTIncGetAbsoluteRank(BWT *bwt, bgint_t* __restrict absoluteRank, bgint_t* __restrict seq, - const unsigned int *packedText, const bgint_t numChar, - const bgint_t* cumulativeCount, const unsigned int firstCharInLastIteration) -{ - bgint_t saIndex; - bgint_t lastWord; - unsigned int packedMask; - bgint_t i; - unsigned int c, t, j; - bgint_t rankIndex; - unsigned int shift; - bgint_t seqIndexFromStart[ALPHABET_SIZE]; - bgint_t seqIndexFromEnd[ALPHABET_SIZE]; - - for (i=0; i> shift; - saIndex = bwt->inverseSa0; - rankIndex = numChar - 1; - - lastWord = numChar / CHAR_PER_WORD; - for (i=lastWord; i--;) { // loop from lastWord - 1 to 0 - t = packedText[i]; - for (j=0; jcumulativeFreq[c] + BWTOccValue(bwt, saIndex, c) + 1; - // A counting sort using the first character of suffix is done here - // If rank > inverseSa0 -> fill seq from end, otherwise fill seq from start -> to leave the right entry for inverseSa0 - if (saIndex > bwt->inverseSa0) { - seq[seqIndexFromEnd[c]] = rankIndex; - absoluteRank[seqIndexFromEnd[c]] = saIndex; - seqIndexFromEnd[c]--; - } else { - seq[seqIndexFromStart[c]] = rankIndex; - absoluteRank[seqIndexFromStart[c]] = saIndex; - seqIndexFromStart[c]++; - } - rankIndex--; - t >>= BIT_PER_CHAR; - } - } - - absoluteRank[seqIndexFromStart[firstCharInLastIteration]] = bwt->inverseSa0; // representing the substring of all preceding characters - seq[seqIndexFromStart[firstCharInLastIteration]] = numChar; - - return seqIndexFromStart[firstCharInLastIteration]; -} - -static void 
BWTIncSortKey(bgint_t* __restrict key, bgint_t* __restrict seq, const bgint_t numItem) -{ - #define EQUAL_KEY_THRESHOLD 4 // Partition for equal key if data array size / the number of data with equal value with pivot < EQUAL_KEY_THRESHOLD - - int64_t lowIndex, highIndex, midIndex; - int64_t lowPartitionIndex, highPartitionIndex; - int64_t lowStack[32], highStack[32]; - int stackDepth; - int64_t i, j; - bgint_t tempSeq, tempKey; - int64_t numberOfEqualKey; - - if (numItem < 2) return; - - stackDepth = 0; - - lowIndex = 0; - highIndex = numItem - 1; - - for (;;) { - - for (;;) { - - // Sort small array of data - if (highIndex - lowIndex < BWTINC_INSERT_SORT_NUM_ITEM) { // Insertion sort on smallest arrays - for (i=lowIndex+1; i<=highIndex; i++) { - tempSeq = seq[i]; - tempKey = key[i]; - for (j = i; j > lowIndex && key[j-1] > tempKey; j--) { - seq[j] = seq[j-1]; - key[j] = key[j-1]; - } - if (j != i) { - seq[j] = tempSeq; - key[j] = tempKey; - } - } - break; - } - - // Choose pivot as median of the lowest, middle, and highest data; sort the three data - - midIndex = average(lowIndex, highIndex); - if (key[lowIndex] > key[midIndex]) { - tempSeq = seq[lowIndex]; - tempKey = key[lowIndex]; - seq[lowIndex] = seq[midIndex]; - key[lowIndex] = key[midIndex]; - seq[midIndex] = tempSeq; - key[midIndex] = tempKey; - } - if (key[lowIndex] > key[highIndex]) { - tempSeq = seq[lowIndex]; - tempKey = key[lowIndex]; - seq[lowIndex] = seq[highIndex]; - key[lowIndex] = key[highIndex]; - seq[highIndex] = tempSeq; - key[highIndex] = tempKey; - } - if (key[midIndex] > key[highIndex]) { - tempSeq = seq[midIndex]; - tempKey = key[midIndex]; - seq[midIndex] = seq[highIndex]; - key[midIndex] = key[highIndex]; - seq[highIndex] = tempSeq; - key[highIndex] = tempKey; - } - - // Partition data - - numberOfEqualKey = 0; - - lowPartitionIndex = lowIndex + 1; - highPartitionIndex = highIndex - 1; - - for (;;) { - while (lowPartitionIndex <= highPartitionIndex && key[lowPartitionIndex] <= 
key[midIndex]) { - numberOfEqualKey += (key[lowPartitionIndex] == key[midIndex]); - lowPartitionIndex++; - } - while (lowPartitionIndex < highPartitionIndex) { - if (key[midIndex] >= key[highPartitionIndex]) { - numberOfEqualKey += (key[midIndex] == key[highPartitionIndex]); - break; - } - highPartitionIndex--; - } - if (lowPartitionIndex >= highPartitionIndex) { - break; - } - tempSeq = seq[lowPartitionIndex]; - tempKey = key[lowPartitionIndex]; - seq[lowPartitionIndex] = seq[highPartitionIndex]; - key[lowPartitionIndex] = key[highPartitionIndex]; - seq[highPartitionIndex] = tempSeq; - key[highPartitionIndex] = tempKey; - if (highPartitionIndex == midIndex) { - // partition key has been moved - midIndex = lowPartitionIndex; - } - lowPartitionIndex++; - highPartitionIndex--; - } - - // Adjust the partition index - highPartitionIndex = lowPartitionIndex; - lowPartitionIndex--; - - // move the partition key to end of low partition - tempSeq = seq[midIndex]; - tempKey = key[midIndex]; - seq[midIndex] = seq[lowPartitionIndex]; - key[midIndex] = key[lowPartitionIndex]; - seq[lowPartitionIndex] = tempSeq; - key[lowPartitionIndex] = tempKey; - - if (highIndex - lowIndex + BWTINC_INSERT_SORT_NUM_ITEM <= EQUAL_KEY_THRESHOLD * numberOfEqualKey) { - - // Many keys = partition key; separate the equal key data from the lower partition - - midIndex = lowIndex; - - for (;;) { - while (midIndex < lowPartitionIndex && key[midIndex] < key[lowPartitionIndex]) { - midIndex++; - } - while (midIndex < lowPartitionIndex && key[lowPartitionIndex] == key[lowPartitionIndex - 1]) { - lowPartitionIndex--; - } - if (midIndex >= lowPartitionIndex) { - break; - } - tempSeq = seq[midIndex]; - tempKey = key[midIndex]; - seq[midIndex] = seq[lowPartitionIndex - 1]; - key[midIndex] = key[lowPartitionIndex - 1]; - seq[lowPartitionIndex - 1] = tempSeq; - key[lowPartitionIndex - 1] = tempKey; - midIndex++; - lowPartitionIndex--; - } - - } - - if (lowPartitionIndex - lowIndex > highIndex - 
highPartitionIndex) { - // put the larger partition to stack - lowStack[stackDepth] = lowIndex; - highStack[stackDepth] = lowPartitionIndex - 1; - stackDepth++; - // sort the smaller partition first - lowIndex = highPartitionIndex; - } else { - // put the larger partition to stack - lowStack[stackDepth] = highPartitionIndex; - highStack[stackDepth] = highIndex; - stackDepth++; - // sort the smaller partition first - if (lowPartitionIndex > lowIndex) { - highIndex = lowPartitionIndex - 1; - } else { - // all keys in the partition equals to the partition key - break; - } - } - continue; - } - - // Pop a range from stack - if (stackDepth > 0) { - stackDepth--; - lowIndex = lowStack[stackDepth]; - highIndex = highStack[stackDepth]; - continue; - } else return; - } -} - - -static void BWTIncBuildRelativeRank(bgint_t* __restrict sortedRank, bgint_t* __restrict seq, - bgint_t* __restrict relativeRank, const bgint_t numItem, - bgint_t oldInverseSa0, const bgint_t *cumulativeCount) -{ - bgint_t i, c; - bgint_t s, r; - bgint_t lastRank, lastIndex; - bgint_t oldInverseSa0RelativeRank = 0; - bgint_t freq; - - lastIndex = numItem; - lastRank = sortedRank[numItem]; - if (lastRank > oldInverseSa0) { - sortedRank[numItem]--; // to prepare for merging; $ is not encoded in bwt - } - s = seq[numItem]; - relativeRank[s] = numItem; - if (lastRank == oldInverseSa0) { - oldInverseSa0RelativeRank = numItem; - oldInverseSa0++; // so that this segment of code is not run again - lastRank++; // so that oldInverseSa0 become a sorted group with 1 item - } - - c = ALPHABET_SIZE - 1; - freq = cumulativeCount[c]; - - for (i=numItem; i--;) { // from numItem - 1 to 0 - r = sortedRank[i]; - if (r > oldInverseSa0) - sortedRank[i]--; // to prepare for merging; $ is not encoded in bwt - s = seq[i]; - if (i < freq) { - if (lastIndex >= freq) - lastRank++; // to trigger the group across alphabet boundary to be split - c--; - freq = cumulativeCount[c]; - } - if (r == lastRank) { - relativeRank[s] = 
lastIndex; - } else { - if (i == lastIndex - 1) { - if (lastIndex < numItem && (sbgint_t)seq[lastIndex + 1] < 0) { - seq[lastIndex] = seq[lastIndex + 1] - 1; - } else { - seq[lastIndex] = (bgint_t)-1; - } - } - lastIndex = i; - lastRank = r; - relativeRank[s] = i; - if (r == oldInverseSa0) { - oldInverseSa0RelativeRank = i; - oldInverseSa0++; // so that this segment of code is not run again - lastRank++; // so that oldInverseSa0 become a sorted group with 1 item - } - } - } - -} - -static void BWTIncBuildBwt(unsigned int* insertBwt, const bgint_t *relativeRank, const bgint_t numChar, - const bgint_t *cumulativeCount) -{ - unsigned int c; - bgint_t i; - bgint_t previousRank, currentRank; - - previousRank = relativeRank[0]; - - for (i=1; i<=numChar; i++) { - currentRank = relativeRank[i]; - c = (previousRank >= cumulativeCount[1]) + (previousRank >= cumulativeCount[2]) - + (previousRank >= cumulativeCount[3]); - insertBwt[currentRank] = c; - previousRank = currentRank; - } -} - -static void BWTIncMergeBwt(const bgint_t *sortedRank, const unsigned int* oldBwt, const unsigned int *insertBwt, - unsigned int* __restrict mergedBwt, const bgint_t numOldBwt, const bgint_t numInsertBwt) -{ - unsigned int bitsInWordMinusBitPerChar; - bgint_t leftShift, rightShift; - bgint_t o; - bgint_t oIndex, iIndex, mIndex; - bgint_t mWord, mChar, oWord, oChar; - bgint_t numInsert; - - bitsInWordMinusBitPerChar = BITS_IN_WORD - BIT_PER_CHAR; - - oIndex = 0; - iIndex = 0; - mIndex = 0; - - mWord = 0; - mChar = 0; - - mergedBwt[0] = 0; // this can be cleared as merged Bwt slightly shift to the left in each iteration - - while (oIndex < numOldBwt) { - - // copy from insertBwt - while (iIndex <= numInsertBwt && sortedRank[iIndex] <= oIndex) { - if (sortedRank[iIndex] != 0) { // special value to indicate that this is for new inverseSa0 - mergedBwt[mWord] |= insertBwt[iIndex] << (BITS_IN_WORD - (mChar + 1) * BIT_PER_CHAR); - mIndex++; - mChar++; - if (mChar == CHAR_PER_WORD) { - mChar = 0; - 
mWord++; - mergedBwt[mWord] = 0; // no need to worry about crossing mergedBwt boundary - } - } - iIndex++; - } - - // Copy from oldBwt to mergedBwt - if (iIndex <= numInsertBwt) { - o = sortedRank[iIndex]; - } else { - o = numOldBwt; - } - numInsert = o - oIndex; - - oWord = oIndex / CHAR_PER_WORD; - oChar = oIndex - oWord * CHAR_PER_WORD; - if (oChar > mChar) { - leftShift = (oChar - mChar) * BIT_PER_CHAR; - rightShift = (CHAR_PER_WORD + mChar - oChar) * BIT_PER_CHAR; - mergedBwt[mWord] = mergedBwt[mWord] - | (oldBwt[oWord] << (oChar * BIT_PER_CHAR) >> (mChar * BIT_PER_CHAR)) - | (oldBwt[oWord+1] >> rightShift); - oIndex += min(numInsert, CHAR_PER_WORD - mChar); - while (o > oIndex) { - oWord++; - mWord++; - mergedBwt[mWord] = (oldBwt[oWord] << leftShift) | (oldBwt[oWord+1] >> rightShift); - oIndex += CHAR_PER_WORD; - } - } else if (oChar < mChar) { - rightShift = (mChar - oChar) * BIT_PER_CHAR; - leftShift = (CHAR_PER_WORD + oChar - mChar) * BIT_PER_CHAR; - mergedBwt[mWord] = mergedBwt[mWord] - | (oldBwt[oWord] << (oChar * BIT_PER_CHAR) >> (mChar * BIT_PER_CHAR)); - oIndex += min(numInsert, CHAR_PER_WORD - mChar); - while (o > oIndex) { - oWord++; - mWord++; - mergedBwt[mWord] = (oldBwt[oWord-1] << leftShift) | (oldBwt[oWord] >> rightShift); - oIndex += CHAR_PER_WORD; - } - } else { // oChar == mChar - mergedBwt[mWord] = mergedBwt[mWord] | truncateLeft(oldBwt[oWord], mChar * BIT_PER_CHAR); - oIndex += min(numInsert, CHAR_PER_WORD - mChar); - while (o > oIndex) { - oWord++; - mWord++; - mergedBwt[mWord] = oldBwt[oWord]; - oIndex += CHAR_PER_WORD; - } - } - oIndex = o; - mIndex += numInsert; - - // Clear the trailing garbage in mergedBwt - mWord = mIndex / CHAR_PER_WORD; - mChar = mIndex - mWord * CHAR_PER_WORD; - if (mChar == 0) { - mergedBwt[mWord] = 0; - } else { - mergedBwt[mWord] = truncateRight(mergedBwt[mWord], (BITS_IN_WORD - mChar * BIT_PER_CHAR)); - } - - } - - // copy from insertBwt - while (iIndex <= numInsertBwt) { - if (sortedRank[iIndex] != 0) { - 
mergedBwt[mWord] |= insertBwt[iIndex] << (BITS_IN_WORD - (mChar + 1) * BIT_PER_CHAR); - mIndex++; - mChar++; - if (mChar == CHAR_PER_WORD) { - mChar = 0; - mWord++; - mergedBwt[mWord] = 0; // no need to worry about crossing mergedBwt boundary - } - } - iIndex++; - } -} - -void BWTClearTrailingBwtCode(BWT *bwt) -{ - bgint_t bwtResidentSizeInWord; - bgint_t wordIndex, offset; - bgint_t i; - - bwtResidentSizeInWord = BWTResidentSizeInWord(bwt->textLength); - - wordIndex = bwt->textLength / CHAR_PER_WORD; - offset = (bwt->textLength - wordIndex * CHAR_PER_WORD) * BIT_PER_CHAR; - if (offset > 0) { - bwt->bwtCode[wordIndex] = truncateRight(bwt->bwtCode[wordIndex], BITS_IN_WORD - offset); - } else { - if (wordIndex < bwtResidentSizeInWord) { - bwt->bwtCode[wordIndex] = 0; - } - } - - for (i=wordIndex+1; ibwtCode[i] = 0; - } -} - - -void BWTGenerateOccValueFromBwt(const unsigned int* bwt, unsigned int* __restrict occValue, - bgint_t* __restrict occValueMajor, - const bgint_t textLength, const unsigned int* decodeTable) -{ - bgint_t numberOfOccValueMajor, numberOfOccValue; - unsigned int wordBetweenOccValue; - bgint_t numberOfOccIntervalPerMajor; - unsigned int c; - bgint_t i, j; - bgint_t occMajorIndex; - bgint_t occIndex, bwtIndex; - bgint_t sum; // perhaps unsigned is big enough - bgint_t tempOccValue0[ALPHABET_SIZE], tempOccValue1[ALPHABET_SIZE]; - - wordBetweenOccValue = OCC_INTERVAL / CHAR_PER_WORD; - - // Calculate occValue - numberOfOccValue = (textLength + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; // Value at both end for bi-directional encoding - numberOfOccIntervalPerMajor = OCC_INTERVAL_MAJOR / OCC_INTERVAL; - numberOfOccValueMajor = (numberOfOccValue + numberOfOccIntervalPerMajor - 1) / numberOfOccIntervalPerMajor; - - tempOccValue0[0] = 0; - tempOccValue0[1] = 0; - tempOccValue0[2] = 0; - tempOccValue0[3] = 0; - occValueMajor[0] = 0; - occValueMajor[1] = 0; - occValueMajor[2] = 0; - occValueMajor[3] = 0; - - occIndex = 0; - bwtIndex = 0; - for (occMajorIndex=1; 
occMajorIndex> 16]; - sum += decodeTable[c & 0x0000FFFF]; - bwtIndex++; - } - if (!DNA_OCC_SUM_EXCEPTION(sum)) { - tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8; - tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8; - tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8; - tempOccValue1[3] += sum; - } else { - if (sum == 0x00000100) { - tempOccValue1[0] += 256; - } else if (sum == 0x00010000) { - tempOccValue1[1] += 256; - } else if (sum == 0x01000000) { - tempOccValue1[2] += 256; - } else { - tempOccValue1[3] += 256; - } - } - occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0]; - occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1]; - occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2]; - occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3]; - tempOccValue0[0] = tempOccValue1[0]; - tempOccValue0[1] = tempOccValue1[1]; - tempOccValue0[2] = tempOccValue1[2]; - tempOccValue0[3] = tempOccValue1[3]; - sum = 0; - - occIndex++; - - for (j=0; j> 16]; - sum += decodeTable[c & 0x0000FFFF]; - bwtIndex++; - } - if (!DNA_OCC_SUM_EXCEPTION(sum)) { - tempOccValue0[0] += (sum & 0x000000FF); sum >>= 8; - tempOccValue0[1] += (sum & 0x000000FF); sum >>= 8; - tempOccValue0[2] += (sum & 0x000000FF); sum >>= 8; - tempOccValue0[3] += sum; - } else { - if (sum == 0x00000100) { - tempOccValue0[0] += 256; - } else if (sum == 0x00010000) { - tempOccValue0[1] += 256; - } else if (sum == 0x01000000) { - tempOccValue0[2] += 256; - } else { - tempOccValue0[3] += 256; - } - } - } - - occValueMajor[occMajorIndex * 4 + 0] = occValueMajor[(occMajorIndex - 1) * 4 + 0] + tempOccValue0[0]; - occValueMajor[occMajorIndex * 4 + 1] = occValueMajor[(occMajorIndex - 1) * 4 + 1] + tempOccValue0[1]; - occValueMajor[occMajorIndex * 4 + 2] = occValueMajor[(occMajorIndex - 1) * 4 + 2] + tempOccValue0[2]; - occValueMajor[occMajorIndex * 4 + 3] = occValueMajor[(occMajorIndex - 1) * 4 + 3] + tempOccValue0[3]; - tempOccValue0[0] = 0; 
- tempOccValue0[1] = 0; - tempOccValue0[2] = 0; - tempOccValue0[3] = 0; - - } - - while (occIndex < (numberOfOccValue-1)/2) { - sum = 0; - tempOccValue1[0] = tempOccValue0[0]; - tempOccValue1[1] = tempOccValue0[1]; - tempOccValue1[2] = tempOccValue0[2]; - tempOccValue1[3] = tempOccValue0[3]; - for (j=0; j> 16]; - sum += decodeTable[c & 0x0000FFFF]; - bwtIndex++; - } - if (!DNA_OCC_SUM_EXCEPTION(sum)) { - tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8; - tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8; - tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8; - tempOccValue1[3] += sum; - } else { - if (sum == 0x00000100) { - tempOccValue1[0] += 256; - } else if (sum == 0x00010000) { - tempOccValue1[1] += 256; - } else if (sum == 0x01000000) { - tempOccValue1[2] += 256; - } else { - tempOccValue1[3] += 256; - } - } - occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0]; - occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1]; - occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2]; - occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3]; - tempOccValue0[0] = tempOccValue1[0]; - tempOccValue0[1] = tempOccValue1[1]; - tempOccValue0[2] = tempOccValue1[2]; - tempOccValue0[3] = tempOccValue1[3]; - sum = 0; - occIndex++; - - for (j=0; j> 16]; - sum += decodeTable[c & 0x0000FFFF]; - bwtIndex++; - } - if (!DNA_OCC_SUM_EXCEPTION(sum)) { - tempOccValue0[0] += (sum & 0x000000FF); sum >>= 8; - tempOccValue0[1] += (sum & 0x000000FF); sum >>= 8; - tempOccValue0[2] += (sum & 0x000000FF); sum >>= 8; - tempOccValue0[3] += sum; - } else { - if (sum == 0x00000100) { - tempOccValue0[0] += 256; - } else if (sum == 0x00010000) { - tempOccValue0[1] += 256; - } else if (sum == 0x01000000) { - tempOccValue0[2] += 256; - } else { - tempOccValue0[3] += 256; - } - } - } - - sum = 0; - tempOccValue1[0] = tempOccValue0[0]; - tempOccValue1[1] = tempOccValue0[1]; - tempOccValue1[2] = tempOccValue0[2]; - tempOccValue1[3] = 
tempOccValue0[3]; - - if (occIndex * 2 < numberOfOccValue - 1) { - for (j=0; j> 16]; - sum += decodeTable[c & 0x0000FFFF]; - bwtIndex++; - } - if (!DNA_OCC_SUM_EXCEPTION(sum)) { - tempOccValue1[0] += (sum & 0x000000FF); sum >>= 8; - tempOccValue1[1] += (sum & 0x000000FF); sum >>= 8; - tempOccValue1[2] += (sum & 0x000000FF); sum >>= 8; - tempOccValue1[3] += sum; - } else { - if (sum == 0x00000100) { - tempOccValue1[0] += 256; - } else if (sum == 0x00010000) { - tempOccValue1[1] += 256; - } else if (sum == 0x01000000) { - tempOccValue1[2] += 256; - } else { - tempOccValue1[3] += 256; - } - } - } - - occValue[occIndex * 4 + 0] = (tempOccValue0[0] << 16) | tempOccValue1[0]; - occValue[occIndex * 4 + 1] = (tempOccValue0[1] << 16) | tempOccValue1[1]; - occValue[occIndex * 4 + 2] = (tempOccValue0[2] << 16) | tempOccValue1[2]; - occValue[occIndex * 4 + 3] = (tempOccValue0[3] << 16) | tempOccValue1[3]; - -} - -static void BWTIncConstruct(BWTInc *bwtInc, const bgint_t numChar) -{ - unsigned int i; - bgint_t mergedBwtSizeInWord, mergedOccSizeInWord; - unsigned int firstCharInThisIteration; - - bgint_t *relativeRank, *seq, *sortedRank; - unsigned int *insertBwt, *mergedBwt; - bgint_t newInverseSa0RelativeRank, oldInverseSa0RelativeRank, newInverseSa0; - - mergedBwtSizeInWord = BWTResidentSizeInWord(bwtInc->bwt->textLength + numChar); - mergedOccSizeInWord = BWTOccValueMinorSizeInWord(bwtInc->bwt->textLength + numChar); - - initializeVAL_bg(bwtInc->cumulativeCountInCurrentBuild, ALPHABET_SIZE + 1, 0); - - if (bwtInc->bwt->textLength == 0) { // Initial build - - // Set address - seq = (bgint_t*)bwtInc->workingMemory; - relativeRank = seq + bwtInc->buildSize + 1; - // mergedBwt and packedTex may share memory - mergedBwt = insertBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord; // build in place - - assert((void*)(relativeRank + bwtInc->buildSize + 1) <= (void*)bwtInc->packedText); - assert((void*)(relativeRank + bwtInc->buildSize + 1) <= 
(void*)mergedBwt); - - // ->packedText is not used any more and may be overwritten by mergedBwt - BWTIncPutPackedTextToRank(bwtInc->packedText, relativeRank, bwtInc->cumulativeCountInCurrentBuild, numChar); - - firstCharInThisIteration = relativeRank[0]; - relativeRank[numChar] = 0; - - // Sort suffix - QSufSortSuffixSort((qsint_t*)relativeRank, (qsint_t*)seq, (qsint_t)numChar, (qsint_t)ALPHABET_SIZE - 1, 0, FALSE); - newInverseSa0 = relativeRank[0]; - - // Clear BWT area - initializeVAL(insertBwt, mergedBwtSizeInWord, 0); - - // Build BWT - BWTIncBuildPackedBwt(relativeRank, insertBwt, numChar, bwtInc->cumulativeCountInCurrentBuild, bwtInc->packedShift); - - // so that the cumulativeCount is not deducted - bwtInc->firstCharInLastIteration = ALPHABET_SIZE; - - } else { // Incremental build - // Set address - sortedRank = (bgint_t*)bwtInc->workingMemory; - seq = sortedRank + bwtInc->buildSize + 1; - insertBwt = (unsigned*)seq; // insertBwt and seq share memory - // relativeRank and ->packedText may share memory - relativeRank = seq + bwtInc->buildSize + 1; - - assert((void*)relativeRank <= (void*)bwtInc->packedText); - - // Store the first character of this iteration - firstCharInThisIteration = bwtInc->packedText[0] >> (BITS_IN_WORD - BIT_PER_CHAR); - - // Count occurrence of input text - ForwardDNAAllOccCountNoLimit(bwtInc->packedText, numChar, bwtInc->cumulativeCountInCurrentBuild + 1, bwtInc->bwt->decodeTable); - // Add the first character of the previous iteration to represent the inverseSa0 of the previous iteration - bwtInc->cumulativeCountInCurrentBuild[bwtInc->firstCharInLastIteration + 1]++; - bwtInc->cumulativeCountInCurrentBuild[2] += bwtInc->cumulativeCountInCurrentBuild[1]; - bwtInc->cumulativeCountInCurrentBuild[3] += bwtInc->cumulativeCountInCurrentBuild[2]; - bwtInc->cumulativeCountInCurrentBuild[4] += bwtInc->cumulativeCountInCurrentBuild[3]; - - // Get rank of new suffix among processed suffix - // The seq array is built into ALPHABET_SIZE + 2 
groups; ALPHABET_SIZE groups + 1 group divided into 2 by inverseSa0 + inverseSa0 as 1 group - // ->packedText is not used any more and will be overwritten by relativeRank - oldInverseSa0RelativeRank = BWTIncGetAbsoluteRank(bwtInc->bwt, sortedRank, seq, bwtInc->packedText, - numChar, bwtInc->cumulativeCountInCurrentBuild, bwtInc->firstCharInLastIteration); - - // Sort rank by ALPHABET_SIZE + 2 groups (or ALPHABET_SIZE + 1 groups when inverseSa0 sit on the border of a group) - for (i=0; icumulativeCountInCurrentBuild[i] > oldInverseSa0RelativeRank || - bwtInc->cumulativeCountInCurrentBuild[i+1] <= oldInverseSa0RelativeRank) { - BWTIncSortKey(sortedRank + bwtInc->cumulativeCountInCurrentBuild[i], seq + bwtInc->cumulativeCountInCurrentBuild[i], bwtInc->cumulativeCountInCurrentBuild[i+1] - bwtInc->cumulativeCountInCurrentBuild[i]); - } else { - if (bwtInc->cumulativeCountInCurrentBuild[i] < oldInverseSa0RelativeRank) { - BWTIncSortKey(sortedRank + bwtInc->cumulativeCountInCurrentBuild[i], seq + bwtInc->cumulativeCountInCurrentBuild[i], oldInverseSa0RelativeRank - bwtInc->cumulativeCountInCurrentBuild[i]); - } - if (bwtInc->cumulativeCountInCurrentBuild[i+1] > oldInverseSa0RelativeRank + 1) { - BWTIncSortKey(sortedRank + oldInverseSa0RelativeRank + 1, seq + oldInverseSa0RelativeRank + 1, bwtInc->cumulativeCountInCurrentBuild[i+1] - oldInverseSa0RelativeRank - 1); - } - } - } - - // build relative rank; sortedRank is updated for merging to cater for the fact that $ is not encoded in bwt - // the cumulative freq information is used to make sure that inverseSa0 and suffix beginning with different characters are kept in different unsorted groups) - BWTIncBuildRelativeRank(sortedRank, seq, relativeRank, numChar, bwtInc->bwt->inverseSa0, bwtInc->cumulativeCountInCurrentBuild); - assert(relativeRank[numChar] == oldInverseSa0RelativeRank); - - // Sort suffix - QSufSortSuffixSort((qsint_t*)relativeRank, (qsint_t*)seq, (qsint_t)numChar, (qsint_t)numChar, 1, TRUE); - - 
newInverseSa0RelativeRank = relativeRank[0]; - newInverseSa0 = sortedRank[newInverseSa0RelativeRank] + newInverseSa0RelativeRank; - - sortedRank[newInverseSa0RelativeRank] = 0; // a special value so that this is skipped in the merged bwt - - // Build BWT; seq is overwritten by insertBwt - BWTIncBuildBwt(insertBwt, relativeRank, numChar, bwtInc->cumulativeCountInCurrentBuild); - - // Merge BWT; relativeRank may be overwritten by mergedBwt - mergedBwt = bwtInc->workingMemory + bwtInc->availableWord - mergedBwtSizeInWord - - bwtInc->numberOfIterationDone * OCC_INTERVAL / BIT_PER_CHAR * (sizeof(bgint_t) / 4); // minus numberOfIteration * occInterval to create a buffer for merging - assert(mergedBwt >= insertBwt + numChar); - BWTIncMergeBwt(sortedRank, bwtInc->bwt->bwtCode, insertBwt, mergedBwt, bwtInc->bwt->textLength, numChar); - } - - // Build auxiliary structure and update info and pointers in BWT - bwtInc->bwt->textLength += numChar; - bwtInc->bwt->bwtCode = mergedBwt; - bwtInc->bwt->bwtSizeInWord = mergedBwtSizeInWord; - bwtInc->bwt->occSizeInWord = mergedOccSizeInWord; - assert(mergedBwt >= bwtInc->workingMemory + mergedOccSizeInWord); - - bwtInc->bwt->occValue = mergedBwt - mergedOccSizeInWord; - - BWTClearTrailingBwtCode(bwtInc->bwt); - BWTGenerateOccValueFromBwt(bwtInc->bwt->bwtCode, bwtInc->bwt->occValue, bwtInc->bwt->occValueMajor, - bwtInc->bwt->textLength, bwtInc->bwt->decodeTable); - - bwtInc->bwt->inverseSa0 = newInverseSa0; - - bwtInc->bwt->cumulativeFreq[1] += bwtInc->cumulativeCountInCurrentBuild[1] - (bwtInc->firstCharInLastIteration <= 0); - bwtInc->bwt->cumulativeFreq[2] += bwtInc->cumulativeCountInCurrentBuild[2] - (bwtInc->firstCharInLastIteration <= 1); - bwtInc->bwt->cumulativeFreq[3] += bwtInc->cumulativeCountInCurrentBuild[3] - (bwtInc->firstCharInLastIteration <= 2); - bwtInc->bwt->cumulativeFreq[4] += bwtInc->cumulativeCountInCurrentBuild[4] - (bwtInc->firstCharInLastIteration <= 3); - - bwtInc->firstCharInLastIteration = 
firstCharInThisIteration; - - // Set build size and text address for the next build - BWTIncSetBuildSizeAndTextAddr(bwtInc); - bwtInc->numberOfIterationDone++; - -} - -BWTInc *BWTIncConstructFromPacked(const char *inputFileName, bgint_t initialMaxBuildSize, bgint_t incMaxBuildSize) -{ - - FILE *packedFile; - bgint_t packedFileLen; - bgint_t totalTextLength; - bgint_t textToLoad, textSizeInByte; - bgint_t processedTextLength; - unsigned char lastByteLength; - - BWTInc *bwtInc; - - packedFile = (FILE*)fopen(inputFileName, "rb"); - - if (packedFile == NULL) { - fprintf(stderr, "BWTIncConstructFromPacked() : Cannot open %s : %s\n", - inputFileName, strerror(errno)); - exit(1); - } - - if (fseek(packedFile, -1, SEEK_END) != 0) { - fprintf(stderr, "BWTIncConstructFromPacked() : Can't seek on %s : %s\n", - inputFileName, strerror(errno)); - exit(1); - } - packedFileLen = ftell(packedFile); - if (packedFileLen == -1) { - fprintf(stderr, "BWTIncConstructFromPacked() : Can't ftell on %s : %s\n", - inputFileName, strerror(errno)); - exit(1); - } - if (fread(&lastByteLength, sizeof(unsigned char), 1, packedFile) != 1) { - fprintf(stderr, - "BWTIncConstructFromPacked() : Can't read from %s : %s\n", - inputFileName, - ferror(packedFile)? 
strerror(errno) : "Unexpected end of file"); - exit(1); - } - totalTextLength = TextLengthFromBytePacked(packedFileLen, BIT_PER_CHAR, lastByteLength); - - bwtInc = BWTIncCreate(totalTextLength, initialMaxBuildSize, incMaxBuildSize); - - BWTIncSetBuildSizeAndTextAddr(bwtInc); - - if (bwtInc->buildSize > totalTextLength) { - textToLoad = totalTextLength; - } else { - textToLoad = totalTextLength - ((totalTextLength - bwtInc->buildSize + CHAR_PER_WORD - 1) / CHAR_PER_WORD * CHAR_PER_WORD); - } - textSizeInByte = textToLoad / CHAR_PER_BYTE; // excluded the odd byte - - if (fseek(packedFile, -((long)textSizeInByte + 2), SEEK_CUR) != 0) { - fprintf(stderr, "BWTIncConstructFromPacked() : Can't seek on %s : %s\n", - inputFileName, strerror(errno)); - exit(1); - } - if (fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte + 1, packedFile) != textSizeInByte + 1) { - fprintf(stderr, - "BWTIncConstructFromPacked() : Can't read from %s : %s\n", - inputFileName, - ferror(packedFile)? strerror(errno) : "Unexpected end of file"); - exit(1); - } - if (fseek(packedFile, -((long)textSizeInByte + 1), SEEK_CUR) != 0) { - fprintf(stderr, "BWTIncConstructFromPacked() : Can't seek on %s : %s\n", - inputFileName, strerror(errno)); - exit(1); - } - - ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad); - BWTIncConstruct(bwtInc, textToLoad); - - processedTextLength = textToLoad; - - while (processedTextLength < totalTextLength) { - textToLoad = bwtInc->buildSize / CHAR_PER_WORD * CHAR_PER_WORD; - if (textToLoad > totalTextLength - processedTextLength) { - textToLoad = totalTextLength - processedTextLength; - } - textSizeInByte = textToLoad / CHAR_PER_BYTE; - if (fseek(packedFile, -((long)textSizeInByte), SEEK_CUR) != 0) { - fprintf(stderr, "BWTIncConstructFromPacked() : Can't seek on %s : %s\n", - inputFileName, strerror(errno)); - exit(1); - } - if (fread(bwtInc->textBuffer, sizeof(unsigned char), textSizeInByte, packedFile) != 
textSizeInByte) { - fprintf(stderr, - "BWTIncConstructFromPacked() : Can't read from %s : %s\n", - inputFileName, - ferror(packedFile)? strerror(errno) : "Unexpected end of file"); - exit(1); - } - if (fseek(packedFile, -((long)textSizeInByte), SEEK_CUR) != 0) { - fprintf(stderr, "BWTIncConstructFromPacked() : Can't seek on %s : %s\n", - inputFileName, strerror(errno)); - exit(1); - } - ConvertBytePackedToWordPacked(bwtInc->textBuffer, bwtInc->packedText, ALPHABET_SIZE, textToLoad); - BWTIncConstruct(bwtInc, textToLoad); - processedTextLength += textToLoad; - if (bwtInc->numberOfIterationDone % 10 == 0) { - fprintf(stderr, "[BWTIncConstructFromPacked] %lu iterations done. %lu characters processed.\n", - (long)bwtInc->numberOfIterationDone, (long)processedTextLength); - } - } - return bwtInc; -} - -void BWTFree(BWT *bwt) -{ - if (bwt == 0) return; - free(bwt->cumulativeFreq); - free(bwt->bwtCode); - free(bwt->occValue); - free(bwt->occValueMajor); - free(bwt->decodeTable); - free(bwt); -} - -void BWTIncFree(BWTInc *bwtInc) -{ - if (bwtInc == 0) return; - free(bwtInc->bwt); - free(bwtInc->workingMemory); - free(bwtInc); -} - -static bgint_t BWTFileSizeInWord(const bgint_t numChar) -{ - // The $ in BWT at the position of inverseSa0 is not encoded - return (numChar + CHAR_PER_WORD - 1) / CHAR_PER_WORD; -} - -void BWTSaveBwtCodeAndOcc(const BWT *bwt, const char *bwtFileName, const char *occValueFileName) -{ - FILE *bwtFile; -/* FILE *occValueFile; */ - bgint_t bwtLength; - - bwtFile = (FILE*)fopen(bwtFileName, "wb"); - if (bwtFile == NULL) { - fprintf(stderr, - "BWTSaveBwtCodeAndOcc(): Cannot open %s for writing: %s\n", - bwtFileName, strerror(errno)); - exit(1); - } - - bwtLength = BWTFileSizeInWord(bwt->textLength); - - if (fwrite(&bwt->inverseSa0, sizeof(bgint_t), 1, bwtFile) != 1 - || fwrite(bwt->cumulativeFreq + 1, - sizeof(bgint_t), ALPHABET_SIZE, bwtFile) != ALPHABET_SIZE - || fwrite(bwt->bwtCode, - sizeof(unsigned int), bwtLength, bwtFile) != bwtLength) { - 
fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Error writing to %s : %s\n", - bwtFileName, strerror(errno)); - exit(1); - } - if (fclose(bwtFile) != 0) { - fprintf(stderr, "BWTSaveBwtCodeAndOcc(): Error on closing %s : %s\n", - bwtFileName, strerror(errno)); - exit(1); - } -} - -void bwt_bwtgen(const char *fn_pac, const char *fn_bwt) -{ - BWTInc *bwtInc; - bwtInc = BWTIncConstructFromPacked(fn_pac, 10000000, 10000000); - printf("[bwt_gen] Finished constructing BWT in %u iterations.\n", bwtInc->numberOfIterationDone); - BWTSaveBwtCodeAndOcc(bwtInc->bwt, fn_bwt, 0); - BWTIncFree(bwtInc); -} - -int bwt_bwtgen_main(int argc, char *argv[]) -{ - if (argc < 3) { - fprintf(stderr, "Usage: bwtgen \n"); - return 1; - } - bwt_bwtgen(argv[1], argv[2]); - return 0; -} - -#ifdef MAIN_BWT_GEN - -int main(int argc, char *argv[]) -{ - return bwt_bwtgen_main(argc, argv); -} - -#endif diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/bwt_lite.c --- a/bwa-0.7.9a/bwt_lite.c Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,98 +0,0 @@ -#include -#include -#include -#include "bwt_lite.h" - -#ifdef USE_MALLOC_WRAPPERS -# include "malloc_wrap.h" -#endif - -int is_sa(const uint8_t *T, int *SA, int n); -int is_bwt(uint8_t *T, int n); - -bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq) -{ - bwtl_t *b; - int i; - b = (bwtl_t*)calloc(1, sizeof(bwtl_t)); - b->seq_len = len; - - { // calculate b->bwt - uint8_t *s; - b->sa = (uint32_t*)calloc(len + 1, 4); - is_sa(seq, (int*)b->sa, len); - s = (uint8_t*)calloc(len + 1, 1); - for (i = 0; i <= len; ++i) { - if (b->sa[i] == 0) b->primary = i; - else s[i] = seq[b->sa[i] - 1]; - } - for (i = b->primary; i < len; ++i) s[i] = s[i + 1]; - b->bwt_size = (len + 15) / 16; - b->bwt = (uint32_t*)calloc(b->bwt_size, 4); - for (i = 0; i < len; ++i) - b->bwt[i>>4] |= s[i] << ((15 - (i&15)) << 1); - free(s); - } - { // calculate b->occ - uint32_t c[4]; - b->n_occ = (len + 15) / 16 * 4; - b->occ = (uint32_t*)calloc(b->n_occ, 4); - memset(c, 
0, 16); - for (i = 0; i < len; ++i) { - if (i % 16 == 0) - memcpy(b->occ + (i/16) * 4, c, 16); - ++c[bwtl_B0(b, i)]; - } - memcpy(b->L2+1, c, 16); - for (i = 2; i < 5; ++i) b->L2[i] += b->L2[i-1]; - } - { // generate cnt_table - for (i = 0; i != 256; ++i) { - u_int32_t j, x = 0; - for (j = 0; j != 4; ++j) - x |= (((i&3) == j) + ((i>>2&3) == j) + ((i>>4&3) == j) + (i>>6 == j)) << (j<<3); - b->cnt_table[i] = x; - } - } - return b; -} -uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c) -{ - uint32_t n, b; - if (k == bwt->seq_len) return bwt->L2[c+1] - bwt->L2[c]; - if (k == (uint32_t)(-1)) return 0; - if (k >= bwt->primary) --k; // because $ is not in bwt - n = bwt->occ[k/16<<2|c]; - b = bwt->bwt[k/16] & ~((1U<<((15-(k&15))<<1)) - 1); - n += (bwt->cnt_table[b&0xff] + bwt->cnt_table[b>>8&0xff] - + bwt->cnt_table[b>>16&0xff] + bwt->cnt_table[b>>24]) >> (c<<3) & 0xff; - if (c == 0) n -= 15 - (k&15); // corrected for the masked bits - return n; -} -void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4]) -{ - uint32_t x, b; - if (k == (uint32_t)(-1)) { - memset(cnt, 0, 16); - return; - } - if (k >= bwt->primary) --k; // because $ is not in bwt - memcpy(cnt, bwt->occ + (k>>4<<2), 16); - b = bwt->bwt[k>>4] & ~((1U<<((~k&15)<<1)) - 1); - x = bwt->cnt_table[b&0xff] + bwt->cnt_table[b>>8&0xff] - + bwt->cnt_table[b>>16&0xff] + bwt->cnt_table[b>>24]; - x -= 15 - (k&15); - cnt[0] += x&0xff; cnt[1] += x>>8&0xff; cnt[2] += x>>16&0xff; cnt[3] += x>>24; -} -void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4]) -{ - bwtl_occ4(bwt, k, cntk); - bwtl_occ4(bwt, l, cntl); -} -void bwtl_destroy(bwtl_t *bwt) -{ - if (bwt) { - free(bwt->occ); free(bwt->bwt); free(bwt->sa); - free(bwt); - } -} diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/bwt_lite.h --- a/bwa-0.7.9a/bwt_lite.h Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,29 +0,0 @@ -#ifndef BWT_LITE_H_ -#define BWT_LITE_H_ - -#include - -typedef 
struct { - uint32_t seq_len, bwt_size, n_occ; - uint32_t primary; - uint32_t *bwt, *occ, *sa, L2[5]; - uint32_t cnt_table[256]; -} bwtl_t; - -#define bwtl_B0(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3) - -#ifdef __cplusplus -extern "C" { -#endif - - bwtl_t *bwtl_seq2bwtl(int len, const uint8_t *seq); - uint32_t bwtl_occ(const bwtl_t *bwt, uint32_t k, uint8_t c); - void bwtl_occ4(const bwtl_t *bwt, uint32_t k, uint32_t cnt[4]); - void bwtl_2occ4(const bwtl_t *bwt, uint32_t k, uint32_t l, uint32_t cntk[4], uint32_t cntl[4]); - void bwtl_destroy(bwtl_t *bwt); - -#ifdef __cplusplus -} -#endif - -#endif diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/bwtaln.c --- a/bwa-0.7.9a/bwtaln.c Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,320 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif -#include "bwtaln.h" -#include "bwtgap.h" -#include "utils.h" -#include "bwa.h" - -#ifdef HAVE_PTHREAD -#include -#endif - -#ifdef USE_MALLOC_WRAPPERS -# include "malloc_wrap.h" -#endif - -gap_opt_t *gap_init_opt() -{ - gap_opt_t *o; - o = (gap_opt_t*)calloc(1, sizeof(gap_opt_t)); - /* IMPORTANT: s_mm*10 should be about the average base error - rate. Voilating this requirement will break pairing! 
*/ - o->s_mm = 3; o->s_gapo = 11; o->s_gape = 4; - o->max_diff = -1; o->max_gapo = 1; o->max_gape = 6; - o->indel_end_skip = 5; o->max_del_occ = 10; o->max_entries = 2000000; - o->mode = BWA_MODE_GAPE | BWA_MODE_COMPREAD; - o->seed_len = 32; o->max_seed_diff = 2; - o->fnr = 0.04; - o->n_threads = 1; - o->max_top2 = 30; - o->trim_qual = 0; - return o; -} - -int bwa_cal_maxdiff(int l, double err, double thres) -{ - double elambda = exp(-l * err); - double sum, y = 1.0; - int k, x = 1; - for (k = 1, sum = elambda; k < 1000; ++k) { - y *= l * err; - x *= k; - sum += elambda * y / x; - if (1.0 - sum < thres) return k; - } - return 2; -} - -// width must be filled as zero -int bwt_cal_width(const bwt_t *bwt, int len, const ubyte_t *str, bwt_width_t *width) -{ - bwtint_t k, l, ok, ol; - int i, bid; - bid = 0; - k = 0; l = bwt->seq_len; - for (i = 0; i < len; ++i) { - ubyte_t c = str[i]; - if (c < 4) { - bwt_2occ(bwt, k - 1, l, c, &ok, &ol); - k = bwt->L2[c] + ok + 1; - l = bwt->L2[c] + ol; - } - if (k > l || c > 3) { // then restart - k = 0; - l = bwt->seq_len; - ++bid; - } - width[i].w = l - k + 1; - width[i].bid = bid; - } - width[len].w = 0; - width[len].bid = ++bid; - return bid; -} - -void bwa_cal_sa_reg_gap(int tid, bwt_t *const bwt, int n_seqs, bwa_seq_t *seqs, const gap_opt_t *opt) -{ - int i, j, max_l = 0, max_len; - gap_stack_t *stack; - bwt_width_t *w, *seed_w; - gap_opt_t local_opt = *opt; - - // initiate priority stack - for (i = max_len = 0; i != n_seqs; ++i) - if (seqs[i].len > max_len) max_len = seqs[i].len; - if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(max_len, BWA_AVG_ERR, opt->fnr); - if (local_opt.max_diff < local_opt.max_gapo) local_opt.max_gapo = local_opt.max_diff; - stack = gap_init_stack(local_opt.max_diff, local_opt.max_gapo, local_opt.max_gape, &local_opt); - - seed_w = (bwt_width_t*)calloc(opt->seed_len+1, sizeof(bwt_width_t)); - w = 0; - for (i = 0; i != n_seqs; ++i) { - bwa_seq_t *p = seqs + i; -#ifdef HAVE_PTHREAD - if (i % 
opt->n_threads != tid) continue; -#endif - p->sa = 0; p->type = BWA_TYPE_NO_MATCH; p->c1 = p->c2 = 0; p->n_aln = 0; p->aln = 0; - if (max_l < p->len) { - max_l = p->len; - w = (bwt_width_t*)realloc(w, (max_l + 1) * sizeof(bwt_width_t)); - memset(w, 0, (max_l + 1) * sizeof(bwt_width_t)); - } - bwt_cal_width(bwt, p->len, p->seq, w); - if (opt->fnr > 0.0) local_opt.max_diff = bwa_cal_maxdiff(p->len, BWA_AVG_ERR, opt->fnr); - local_opt.seed_len = opt->seed_len < p->len? opt->seed_len : 0x7fffffff; - if (p->len > opt->seed_len) - bwt_cal_width(bwt, opt->seed_len, p->seq + (p->len - opt->seed_len), seed_w); - // core function - for (j = 0; j < p->len; ++j) // we need to complement - p->seq[j] = p->seq[j] > 3? 4 : 3 - p->seq[j]; - p->aln = bwt_match_gap(bwt, p->len, p->seq, w, p->len <= opt->seed_len? 0 : seed_w, &local_opt, &p->n_aln, stack); - //fprintf(stderr, "mm=%lld,ins=%lld,del=%lld,gapo=%lld\n", p->aln->n_mm, p->aln->n_ins, p->aln->n_del, p->aln->n_gapo); - // clean up the unused data in the record - free(p->name); free(p->seq); free(p->rseq); free(p->qual); - p->name = 0; p->seq = p->rseq = p->qual = 0; - } - free(seed_w); free(w); - gap_destroy_stack(stack); -} - -#ifdef HAVE_PTHREAD -typedef struct { - int tid; - bwt_t *bwt; - int n_seqs; - bwa_seq_t *seqs; - const gap_opt_t *opt; -} thread_aux_t; - -static void *worker(void *data) -{ - thread_aux_t *d = (thread_aux_t*)data; - bwa_cal_sa_reg_gap(d->tid, d->bwt, d->n_seqs, d->seqs, d->opt); - return 0; -} -#endif - -bwa_seqio_t *bwa_open_reads(int mode, const char *fn_fa) -{ - bwa_seqio_t *ks; - if (mode & BWA_MODE_BAM) { // open BAM - int which = 0; - if (mode & BWA_MODE_BAM_SE) which |= 4; - if (mode & BWA_MODE_BAM_READ1) which |= 1; - if (mode & BWA_MODE_BAM_READ2) which |= 2; - if (which == 0) which = 7; // then read all reads - ks = bwa_bam_open(fn_fa, which); - } else ks = bwa_seq_open(fn_fa); - return ks; -} - -void bwa_aln_core(const char *prefix, const char *fn_fa, const gap_opt_t *opt) -{ - int i, 
n_seqs, tot_seqs = 0; - bwa_seq_t *seqs; - bwa_seqio_t *ks; - clock_t t; - bwt_t *bwt; - - // initialization - ks = bwa_open_reads(opt->mode, fn_fa); - - { // load BWT - char *str = (char*)calloc(strlen(prefix) + 10, 1); - strcpy(str, prefix); strcat(str, ".bwt"); bwt = bwt_restore_bwt(str); - free(str); - } - - // core loop - err_fwrite(SAI_MAGIC, 1, 4, stdout); - err_fwrite(opt, sizeof(gap_opt_t), 1, stdout); - while ((seqs = bwa_read_seq(ks, 0x40000, &n_seqs, opt->mode, opt->trim_qual)) != 0) { - tot_seqs += n_seqs; - t = clock(); - - fprintf(stderr, "[bwa_aln_core] calculate SA coordinate... "); - -#ifdef HAVE_PTHREAD - if (opt->n_threads <= 1) { // no multi-threading at all - bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt); - } else { - pthread_t *tid; - pthread_attr_t attr; - thread_aux_t *data; - int j; - pthread_attr_init(&attr); - pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); - data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t)); - tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); - for (j = 0; j < opt->n_threads; ++j) { - data[j].tid = j; data[j].bwt = bwt; - data[j].n_seqs = n_seqs; data[j].seqs = seqs; data[j].opt = opt; - pthread_create(&tid[j], &attr, worker, data + j); - } - for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0); - free(data); free(tid); - } -#else - bwa_cal_sa_reg_gap(0, bwt, n_seqs, seqs, opt); -#endif - - fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); - - t = clock(); - fprintf(stderr, "[bwa_aln_core] write to the disk... 
"); - for (i = 0; i < n_seqs; ++i) { - bwa_seq_t *p = seqs + i; - err_fwrite(&p->n_aln, 4, 1, stdout); - if (p->n_aln) err_fwrite(p->aln, sizeof(bwt_aln1_t), p->n_aln, stdout); - } - fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); - - bwa_free_read_seq(n_seqs, seqs); - fprintf(stderr, "[bwa_aln_core] %d sequences have been processed.\n", tot_seqs); - } - - // destroy - bwt_destroy(bwt); - bwa_seq_close(ks); -} - -int bwa_aln(int argc, char *argv[]) -{ - int c, opte = -1; - gap_opt_t *opt; - char *prefix; - - opt = gap_init_opt(); - while ((c = getopt(argc, argv, "n:o:e:i:d:l:k:LR:m:t:NM:O:E:q:f:b012IYB:")) >= 0) { - switch (c) { - case 'n': - if (strstr(optarg, ".")) opt->fnr = atof(optarg), opt->max_diff = -1; - else opt->max_diff = atoi(optarg), opt->fnr = -1.0; - break; - case 'o': opt->max_gapo = atoi(optarg); break; - case 'e': opte = atoi(optarg); break; - case 'M': opt->s_mm = atoi(optarg); break; - case 'O': opt->s_gapo = atoi(optarg); break; - case 'E': opt->s_gape = atoi(optarg); break; - case 'd': opt->max_del_occ = atoi(optarg); break; - case 'i': opt->indel_end_skip = atoi(optarg); break; - case 'l': opt->seed_len = atoi(optarg); break; - case 'k': opt->max_seed_diff = atoi(optarg); break; - case 'm': opt->max_entries = atoi(optarg); break; - case 't': opt->n_threads = atoi(optarg); break; - case 'L': opt->mode |= BWA_MODE_LOGGAP; break; - case 'R': opt->max_top2 = atoi(optarg); break; - case 'q': opt->trim_qual = atoi(optarg); break; - case 'N': opt->mode |= BWA_MODE_NONSTOP; opt->max_top2 = 0x7fffffff; break; - case 'f': xreopen(optarg, "wb", stdout); break; - case 'b': opt->mode |= BWA_MODE_BAM; break; - case '0': opt->mode |= BWA_MODE_BAM_SE; break; - case '1': opt->mode |= BWA_MODE_BAM_READ1; break; - case '2': opt->mode |= BWA_MODE_BAM_READ2; break; - case 'I': opt->mode |= BWA_MODE_IL13; break; - case 'Y': opt->mode |= BWA_MODE_CFY; break; - case 'B': opt->mode |= atoi(optarg) << 24; break; - default: return 1; - } - } - if 
(opte > 0) { - opt->max_gape = opte; - opt->mode &= ~BWA_MODE_GAPE; - } - - if (optind + 2 > argc) { - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: bwa aln [options] \n\n"); - fprintf(stderr, "Options: -n NUM max #diff (int) or missing prob under %.2f err rate (float) [%.2f]\n", - BWA_AVG_ERR, opt->fnr); - fprintf(stderr, " -o INT maximum number or fraction of gap opens [%d]\n", opt->max_gapo); - fprintf(stderr, " -e INT maximum number of gap extensions, -1 for disabling long gaps [-1]\n"); - fprintf(stderr, " -i INT do not put an indel within INT bp towards the ends [%d]\n", opt->indel_end_skip); - fprintf(stderr, " -d INT maximum occurrences for extending a long deletion [%d]\n", opt->max_del_occ); - fprintf(stderr, " -l INT seed length [%d]\n", opt->seed_len); - fprintf(stderr, " -k INT maximum differences in the seed [%d]\n", opt->max_seed_diff); - fprintf(stderr, " -m INT maximum entries in the queue [%d]\n", opt->max_entries); - fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); - fprintf(stderr, " -M INT mismatch penalty [%d]\n", opt->s_mm); - fprintf(stderr, " -O INT gap open penalty [%d]\n", opt->s_gapo); - fprintf(stderr, " -E INT gap extension penalty [%d]\n", opt->s_gape); - fprintf(stderr, " -R INT stop searching when there are >INT equally best hits [%d]\n", opt->max_top2); - fprintf(stderr, " -q INT quality threshold for read trimming down to %dbp [%d]\n", BWA_MIN_RDLEN, opt->trim_qual); - fprintf(stderr, " -f FILE file to write output to instead of stdout\n"); - fprintf(stderr, " -B INT length of barcode\n"); - fprintf(stderr, " -L log-scaled gap penalty for long deletions\n"); - fprintf(stderr, " -N non-iterative mode: search for all n-difference hits (slooow)\n"); - fprintf(stderr, " -I the input is in the Illumina 1.3+ FASTQ-like format\n"); - fprintf(stderr, " -b the input read file is in the BAM format\n"); - fprintf(stderr, " -0 use single-end reads only (effective with -b)\n"); - fprintf(stderr, " -1 use the 1st read in 
a pair (effective with -b)\n"); - fprintf(stderr, " -2 use the 2nd read in a pair (effective with -b)\n"); - fprintf(stderr, " -Y filter Casava-filtered sequences\n"); - fprintf(stderr, "\n"); - return 1; - } - if (opt->fnr > 0.0) { - int i, k; - for (i = 17, k = 0; i <= 250; ++i) { - int l = bwa_cal_maxdiff(i, BWA_AVG_ERR, opt->fnr); - if (l != k) fprintf(stderr, "[bwa_aln] %dbp reads: max_diff = %d\n", i, l); - k = l; - } - } - if ((prefix = bwa_idx_infer_prefix(argv[optind])) == 0) { - fprintf(stderr, "[%s] fail to locate the index\n", __func__); - free(opt); - return 1; - } - bwa_aln_core(prefix, argv[optind+1], opt); - free(opt); free(prefix); - return 0; -} diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/bwtaln.h --- a/bwa-0.7.9a/bwtaln.h Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,153 +0,0 @@ -#ifndef BWTALN_H -#define BWTALN_H - -#include -#include "bwt.h" - -#define BWA_TYPE_NO_MATCH 0 -#define BWA_TYPE_UNIQUE 1 -#define BWA_TYPE_REPEAT 2 -#define BWA_TYPE_MATESW 3 - -#define SAM_FPD 1 // paired -#define SAM_FPP 2 // properly paired -#define SAM_FSU 4 // self-unmapped -#define SAM_FMU 8 // mate-unmapped -#define SAM_FSR 16 // self on the reverse strand -#define SAM_FMR 32 // mate on the reverse strand -#define SAM_FR1 64 // this is read one -#define SAM_FR2 128 // this is read two -#define SAM_FSC 256 // secondary alignment - -#define BWA_AVG_ERR 0.02 -#define BWA_MIN_RDLEN 35 // for read trimming - -#define BWA_MAX_BCLEN 63 // maximum barcode length; 127 is the maximum - -#ifndef bns_pac -#define bns_pac(pac, k) ((pac)[(k)>>2] >> ((~(k)&3)<<1) & 3) -#endif - -#define FROM_M 0 -#define FROM_I 1 -#define FROM_D 2 -#define FROM_S 3 - -#define SAI_MAGIC "SAI\1" - -typedef struct { - bwtint_t w; - int bid; -} bwt_width_t; - -typedef struct { - uint64_t n_mm:8, n_gapo:8, n_gape:8, score:20, n_ins:10, n_del:10; - bwtint_t k, l; -} bwt_aln1_t; - -typedef uint16_t bwa_cigar_t; -/* rgoya: If changing order of bytes, beware of 
operations like: - * s->cigar[0] += s->full_len - s->len; - */ -#define CIGAR_OP_SHIFT 14 -#define CIGAR_LN_MASK 0x3fff - -#define __cigar_op(__cigar) ((__cigar)>>CIGAR_OP_SHIFT) -#define __cigar_len(__cigar) ((__cigar)&CIGAR_LN_MASK) -#define __cigar_create(__op, __len) ((__op)< -#include -#include -#include "bwtgap.h" -#include "bwtaln.h" - -#ifdef USE_MALLOC_WRAPPERS -# include "malloc_wrap.h" -#endif - -#define STATE_M 0 -#define STATE_I 1 -#define STATE_D 2 - -#define aln_score(m,o,e,p) ((m)*(p)->s_mm + (o)*(p)->s_gapo + (e)*(p)->s_gape) - -gap_stack_t *gap_init_stack2(int max_score) -{ - gap_stack_t *stack; - stack = (gap_stack_t*)calloc(1, sizeof(gap_stack_t)); - stack->n_stacks = max_score; - stack->stacks = (gap_stack1_t*)calloc(stack->n_stacks, sizeof(gap_stack1_t)); - return stack; -} - -gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt) -{ - return gap_init_stack2(aln_score(max_mm+1, max_gapo+1, max_gape+1, opt)); -} - -void gap_destroy_stack(gap_stack_t *stack) -{ - int i; - for (i = 0; i != stack->n_stacks; ++i) free(stack->stacks[i].stack); - free(stack->stacks); - free(stack); -} - -static void gap_reset_stack(gap_stack_t *stack) -{ - int i; - for (i = 0; i != stack->n_stacks; ++i) - stack->stacks[i].n_entries = 0; - stack->best = stack->n_stacks; - stack->n_entries = 0; -} - -static inline void gap_push(gap_stack_t *stack, int i, bwtint_t k, bwtint_t l, int n_mm, int n_gapo, int n_gape, int n_ins, int n_del, - int state, int is_diff, const gap_opt_t *opt) -{ - int score; - gap_entry_t *p; - gap_stack1_t *q; - score = aln_score(n_mm, n_gapo, n_gape, opt); - q = stack->stacks + score; - if (q->n_entries == q->m_entries) { - q->m_entries = q->m_entries? 
q->m_entries<<1 : 4; - q->stack = (gap_entry_t*)realloc(q->stack, sizeof(gap_entry_t) * q->m_entries); - } - p = q->stack + q->n_entries; - p->info = (u_int32_t)score<<21 | i; p->k = k; p->l = l; - p->n_mm = n_mm; p->n_gapo = n_gapo; p->n_gape = n_gape; - p->n_ins = n_ins; p->n_del = n_del; - p->state = state; - p->last_diff_pos = is_diff? i : 0; - ++(q->n_entries); - ++(stack->n_entries); - if (stack->best > score) stack->best = score; -} - -static inline void gap_pop(gap_stack_t *stack, gap_entry_t *e) -{ - gap_stack1_t *q; - q = stack->stacks + stack->best; - *e = q->stack[q->n_entries - 1]; - --(q->n_entries); - --(stack->n_entries); - if (q->n_entries == 0 && stack->n_entries) { // reset best - int i; - for (i = stack->best + 1; i < stack->n_stacks; ++i) - if (stack->stacks[i].n_entries != 0) break; - stack->best = i; - } else if (stack->n_entries == 0) stack->best = stack->n_stacks; -} - -static inline void gap_shadow(int x, int len, bwtint_t max, int last_diff_pos, bwt_width_t *w) -{ - int i, j; - for (i = j = 0; i < last_diff_pos; ++i) { - if (w[i].w > x) w[i].w -= x; - else if (w[i].w == x) { - w[i].bid = 1; - w[i].w = max - (++j); - } // else should not happen - } -} - -static inline int int_log2(uint32_t v) -{ - int c = 0; - if (v & 0xffff0000u) { v >>= 16; c |= 16; } - if (v & 0xff00) { v >>= 8; c |= 8; } - if (v & 0xf0) { v >>= 4; c |= 4; } - if (v & 0xc) { v >>= 2; c |= 2; } - if (v & 0x2) c |= 1; - return c; -} - -bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_width_t *width, - bwt_width_t *seed_width, const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack) -{ // $seq is the reverse complement of the input read - int best_score = aln_score(opt->max_diff+1, opt->max_gapo+1, opt->max_gape+1, opt); - int best_diff = opt->max_diff + 1, max_diff = opt->max_diff; - int best_cnt = 0; - int max_entries = 0, j, _j, n_aln, m_aln; - bwt_aln1_t *aln; - - m_aln = 4; n_aln = 0; - aln = (bwt_aln1_t*)calloc(m_aln, sizeof(bwt_aln1_t)); - 
- // check whether there are too many N - for (j = _j = 0; j < len; ++j) - if (seq[j] > 3) ++_j; - if (_j > max_diff) { - *_n_aln = n_aln; - return aln; - } - - //for (j = 0; j != len; ++j) printf("#0 %d: [%d,%u]\t[%d,%u]\n", j, w[0][j].bid, w[0][j].w, w[1][j].bid, w[1][j].w); - gap_reset_stack(stack); // reset stack - gap_push(stack, len, 0, bwt->seq_len, 0, 0, 0, 0, 0, 0, 0, opt); - - while (stack->n_entries) { - gap_entry_t e; - int i, m, m_seed = 0, hit_found, allow_diff, allow_M, tmp; - bwtint_t k, l, cnt_k[4], cnt_l[4], occ; - - if (max_entries < stack->n_entries) max_entries = stack->n_entries; - if (stack->n_entries > opt->max_entries) break; - gap_pop(stack, &e); // get the best entry - k = e.k; l = e.l; // SA interval - i = e.info&0xffff; // length - if (!(opt->mode & BWA_MODE_NONSTOP) && e.info>>21 > best_score + opt->s_mm) break; // no need to proceed - - m = max_diff - (e.n_mm + e.n_gapo); - if (opt->mode & BWA_MODE_GAPE) m -= e.n_gape; - if (m < 0) continue; - if (seed_width) { // apply seeding - m_seed = opt->max_seed_diff - (e.n_mm + e.n_gapo); - if (opt->mode & BWA_MODE_GAPE) m_seed -= e.n_gape; - } - //printf("#1\t[%d,%d,%d,%c]\t[%d,%d,%d]\t[%u,%u]\t[%u,%u]\t%d\n", stack->n_entries, a, i, "MID"[e.state], e.n_mm, e.n_gapo, e.n_gape, width[i-1].bid, width[i-1].w, k, l, e.last_diff_pos); - if (i > 0 && m < width[i-1].bid) continue; - - // check whether a hit is found - hit_found = 0; - if (i == 0) hit_found = 1; - else if (m == 0 && (e.state == STATE_M || (opt->mode&BWA_MODE_GAPE) || e.n_gape == opt->max_gape)) { // no diff allowed - if (bwt_match_exact_alt(bwt, i, seq, &k, &l)) hit_found = 1; - else continue; // no hit, skip - } - - if (hit_found) { // action for found hits - int score = aln_score(e.n_mm, e.n_gapo, e.n_gape, opt); - int do_add = 1; - //printf("#2 hits found: %d:(%u,%u)\n", e.n_mm+e.n_gapo, k, l); - if (n_aln == 0) { - best_score = score; - best_diff = e.n_mm + e.n_gapo; - if (opt->mode & BWA_MODE_GAPE) best_diff += e.n_gape; - if 
(!(opt->mode & BWA_MODE_NONSTOP)) - max_diff = (best_diff + 1 > opt->max_diff)? opt->max_diff : best_diff + 1; // top2 behaviour - } - if (score == best_score) best_cnt += l - k + 1; - else if (best_cnt > opt->max_top2) break; // top2b behaviour - if (e.n_gapo) { // check whether the hit has been found. this may happen when a gap occurs in a tandem repeat - for (j = 0; j != n_aln; ++j) - if (aln[j].k == k && aln[j].l == l) break; - if (j < n_aln) do_add = 0; - } - if (do_add) { // append - bwt_aln1_t *p; - gap_shadow(l - k + 1, len, bwt->seq_len, e.last_diff_pos, width); - if (n_aln == m_aln) { - m_aln <<= 1; - aln = (bwt_aln1_t*)realloc(aln, m_aln * sizeof(bwt_aln1_t)); - memset(aln + m_aln/2, 0, m_aln/2*sizeof(bwt_aln1_t)); - } - p = aln + n_aln; - p->n_mm = e.n_mm; p->n_gapo = e.n_gapo; p->n_gape = e.n_gape; - p->n_ins = e.n_ins; p->n_del = e.n_del; - p->k = k; p->l = l; - p->score = score; - //fprintf(stderr, "*** n_mm=%d,n_gapo=%d,n_gape=%d,n_ins=%d,n_del=%d\n", e.n_mm, e.n_gapo, e.n_gape, e.n_ins, e.n_del); - ++n_aln; - } - continue; - } - - --i; - bwt_2occ4(bwt, k - 1, l, cnt_k, cnt_l); // retrieve Occ values - occ = l - k + 1; - // test whether diff is allowed - allow_diff = allow_M = 1; - if (i > 0) { - int ii = i - (len - opt->seed_len); - if (width[i-1].bid > m-1) allow_diff = 0; - else if (width[i-1].bid == m-1 && width[i].bid == m-1 && width[i-1].w == width[i].w) allow_M = 0; - if (seed_width && ii > 0) { - if (seed_width[ii-1].bid > m_seed-1) allow_diff = 0; - else if (seed_width[ii-1].bid == m_seed-1 && seed_width[ii].bid == m_seed-1 - && seed_width[ii-1].w == seed_width[ii].w) allow_M = 0; - } - } - // indels - tmp = (opt->mode & BWA_MODE_LOGGAP)? 
int_log2(e.n_gape + e.n_gapo)/2+1 : e.n_gapo + e.n_gape; - if (allow_diff && i >= opt->indel_end_skip + tmp && len - i >= opt->indel_end_skip + tmp) { - if (e.state == STATE_M) { // gap open - if (e.n_gapo < opt->max_gapo) { // gap open is allowed - // insertion - gap_push(stack, i, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, e.n_ins + 1, e.n_del, STATE_I, 1, opt); - // deletion - for (j = 0; j != 4; ++j) { - k = bwt->L2[j] + cnt_k[j] + 1; - l = bwt->L2[j] + cnt_l[j]; - if (k <= l) gap_push(stack, i + 1, k, l, e.n_mm, e.n_gapo + 1, e.n_gape, e.n_ins, e.n_del + 1, STATE_D, 1, opt); - } - } - } else if (e.state == STATE_I) { // extention of an insertion - if (e.n_gape < opt->max_gape) // gap extention is allowed - gap_push(stack, i, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, e.n_ins + 1, e.n_del, STATE_I, 1, opt); - } else if (e.state == STATE_D) { // extention of a deletion - if (e.n_gape < opt->max_gape) { // gap extention is allowed - if (e.n_gape + e.n_gapo < max_diff || occ < opt->max_del_occ) { - for (j = 0; j != 4; ++j) { - k = bwt->L2[j] + cnt_k[j] + 1; - l = bwt->L2[j] + cnt_l[j]; - if (k <= l) gap_push(stack, i + 1, k, l, e.n_mm, e.n_gapo, e.n_gape + 1, e.n_ins, e.n_del + 1, STATE_D, 1, opt); - } - } - } - } - } - // mismatches - if (allow_diff && allow_M) { // mismatch is allowed - for (j = 1; j <= 4; ++j) { - int c = (seq[i] + j) & 3; - int is_mm = (j != 4 || seq[i] > 3); - k = bwt->L2[c] + cnt_k[c] + 1; - l = bwt->L2[c] + cnt_l[c]; - if (k <= l) gap_push(stack, i, k, l, e.n_mm + is_mm, e.n_gapo, e.n_gape, e.n_ins, e.n_del, STATE_M, is_mm, opt); - } - } else if (seq[i] < 4) { // try exact match only - int c = seq[i] & 3; - k = bwt->L2[c] + cnt_k[c] + 1; - l = bwt->L2[c] + cnt_l[c]; - if (k <= l) gap_push(stack, i, k, l, e.n_mm, e.n_gapo, e.n_gape, e.n_ins, e.n_del, STATE_M, 0, opt); - } - } - - *_n_aln = n_aln; - //fprintf(stderr, "max_entries = %d\n", max_entries); - return aln; -} diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/bwtgap.h --- a/bwa-0.7.9a/bwtgap.h 
Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,40 +0,0 @@ -#ifndef BWTGAP_H_ -#define BWTGAP_H_ - -#include "bwt.h" -#include "bwtaln.h" - -typedef struct { // recursion stack - u_int32_t info; // score<<21 | i - u_int32_t n_mm:8, n_gapo:8, n_gape:8, state:2, n_seed_mm:6; - u_int32_t n_ins:16, n_del:16; - int last_diff_pos; - bwtint_t k, l; // (k,l) is the SA region of [i,n-1] -} gap_entry_t; - -typedef struct { - int n_entries, m_entries; - gap_entry_t *stack; -} gap_stack1_t; - -typedef struct { - int n_stacks, best, n_entries; - gap_stack1_t *stacks; -} gap_stack_t; - -#ifdef __cplusplus -extern "C" { -#endif - - gap_stack_t *gap_init_stack2(int max_score); - gap_stack_t *gap_init_stack(int max_mm, int max_gapo, int max_gape, const gap_opt_t *opt); - void gap_destroy_stack(gap_stack_t *stack); - bwt_aln1_t *bwt_match_gap(bwt_t *const bwt, int len, const ubyte_t *seq, bwt_width_t *w, - bwt_width_t *seed_w, const gap_opt_t *opt, int *_n_aln, gap_stack_t *stack); - void bwa_aln2seq(int n_aln, const bwt_aln1_t *aln, bwa_seq_t *s); - -#ifdef __cplusplus -} -#endif - -#endif diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/bwtindex.c --- a/bwa-0.7.9a/bwtindex.c Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,287 +0,0 @@ -/* The MIT License - - Copyright (c) 2008 Genome Research Ltd (GRL). - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. 
- - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* Contact: Heng Li */ - -#include -#include -#include -#include -#include -#include -#include "bntseq.h" -#include "bwt.h" -#include "utils.h" - -#ifdef _DIVBWT -#include "divsufsort.h" -#endif - -#ifdef USE_MALLOC_WRAPPERS -# include "malloc_wrap.h" -#endif - - -int is_bwt(ubyte_t *T, int n); - -int64_t bwa_seq_len(const char *fn_pac) -{ - FILE *fp; - int64_t pac_len; - ubyte_t c; - fp = xopen(fn_pac, "rb"); - err_fseek(fp, -1, SEEK_END); - pac_len = err_ftell(fp); - err_fread_noeof(&c, 1, 1, fp); - err_fclose(fp); - return (pac_len - 1) * 4 + (int)c; -} - -bwt_t *bwt_pac2bwt(const char *fn_pac, int use_is) -{ - bwt_t *bwt; - ubyte_t *buf, *buf2; - int i, pac_size; - FILE *fp; - - // initialization - bwt = (bwt_t*)calloc(1, sizeof(bwt_t)); - bwt->seq_len = bwa_seq_len(fn_pac); - bwt->bwt_size = (bwt->seq_len + 15) >> 4; - fp = xopen(fn_pac, "rb"); - - // prepare sequence - pac_size = (bwt->seq_len>>2) + ((bwt->seq_len&3) == 0? 
0 : 1); - buf2 = (ubyte_t*)calloc(pac_size, 1); - err_fread_noeof(buf2, 1, pac_size, fp); - err_fclose(fp); - memset(bwt->L2, 0, 5 * 4); - buf = (ubyte_t*)calloc(bwt->seq_len + 1, 1); - for (i = 0; i < bwt->seq_len; ++i) { - buf[i] = buf2[i>>2] >> ((3 - (i&3)) << 1) & 3; - ++bwt->L2[1+buf[i]]; - } - for (i = 2; i <= 4; ++i) bwt->L2[i] += bwt->L2[i-1]; - free(buf2); - - // Burrows-Wheeler Transform - if (use_is) { - bwt->primary = is_bwt(buf, bwt->seq_len); - } else { -#ifdef _DIVBWT - bwt->primary = divbwt(buf, buf, 0, bwt->seq_len); -#else - err_fatal_simple("libdivsufsort is not compiled in."); -#endif - } - bwt->bwt = (u_int32_t*)calloc(bwt->bwt_size, 4); - for (i = 0; i < bwt->seq_len; ++i) - bwt->bwt[i>>4] |= buf[i] << ((15 - (i&15)) << 1); - free(buf); - return bwt; -} - -int bwa_pac2bwt(int argc, char *argv[]) // the "pac2bwt" command; IMPORTANT: bwt generated at this step CANNOT be used with BWA. bwtupdate is required! -{ - bwt_t *bwt; - int c, use_is = 1; - while ((c = getopt(argc, argv, "d")) >= 0) { - switch (c) { - case 'd': use_is = 0; break; - default: return 1; - } - } - if (optind + 2 > argc) { - fprintf(stderr, "Usage: bwa pac2bwt [-d] \n"); - return 1; - } - bwt = bwt_pac2bwt(argv[optind], use_is); - bwt_dump_bwt(argv[optind+1], bwt); - bwt_destroy(bwt); - return 0; -} - -#define bwt_B00(b, k) ((b)->bwt[(k)>>4]>>((~(k)&0xf)<<1)&3) - -void bwt_bwtupdate_core(bwt_t *bwt) -{ - bwtint_t i, k, c[4], n_occ; - uint32_t *buf; - - n_occ = (bwt->seq_len + OCC_INTERVAL - 1) / OCC_INTERVAL + 1; - bwt->bwt_size += n_occ * sizeof(bwtint_t); // the new size - buf = (uint32_t*)calloc(bwt->bwt_size, 4); // will be the new bwt - c[0] = c[1] = c[2] = c[3] = 0; - for (i = k = 0; i < bwt->seq_len; ++i) { - if (i % OCC_INTERVAL == 0) { - memcpy(buf + k, c, sizeof(bwtint_t) * 4); - k += sizeof(bwtint_t); // in fact: sizeof(bwtint_t)=4*(sizeof(bwtint_t)/4) - } - if (i % 16 == 0) buf[k++] = bwt->bwt[i/16]; // 16 == sizeof(uint32_t)/2 - ++c[bwt_B00(bwt, i)]; - } - // the 
last element - memcpy(buf + k, c, sizeof(bwtint_t) * 4); - xassert(k + sizeof(bwtint_t) == bwt->bwt_size, "inconsistent bwt_size"); - // update bwt - free(bwt->bwt); bwt->bwt = buf; -} - -int bwa_bwtupdate(int argc, char *argv[]) // the "bwtupdate" command -{ - bwt_t *bwt; - if (argc < 2) { - fprintf(stderr, "Usage: bwa bwtupdate \n"); - return 1; - } - bwt = bwt_restore_bwt(argv[1]); - bwt_bwtupdate_core(bwt); - bwt_dump_bwt(argv[1], bwt); - bwt_destroy(bwt); - return 0; -} - -int bwa_bwt2sa(int argc, char *argv[]) // the "bwt2sa" command -{ - bwt_t *bwt; - int c, sa_intv = 32; - while ((c = getopt(argc, argv, "i:")) >= 0) { - switch (c) { - case 'i': sa_intv = atoi(optarg); break; - default: return 1; - } - } - if (optind + 2 > argc) { - fprintf(stderr, "Usage: bwa bwt2sa [-i %d] \n", sa_intv); - return 1; - } - bwt = bwt_restore_bwt(argv[optind]); - bwt_cal_sa(bwt, sa_intv); - bwt_dump_sa(argv[optind+1], bwt); - bwt_destroy(bwt); - return 0; -} - -int bwa_index(int argc, char *argv[]) // the "index" command -{ - extern void bwa_pac_rev_core(const char *fn, const char *fn_rev); - - char *prefix = 0, *str, *str2, *str3; - int c, algo_type = 0, is_64 = 0; - clock_t t; - int64_t l_pac; - - while ((c = getopt(argc, argv, "6a:p:")) >= 0) { - switch (c) { - case 'a': // if -a is not set, algo_type will be determined later - if (strcmp(optarg, "div") == 0) algo_type = 1; - else if (strcmp(optarg, "bwtsw") == 0) algo_type = 2; - else if (strcmp(optarg, "is") == 0) algo_type = 3; - else err_fatal(__func__, "unknown algorithm: '%s'.", optarg); - break; - case 'p': prefix = strdup(optarg); break; - case '6': is_64 = 1; break; - default: return 1; - } - } - - if (optind + 1 > argc) { - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: bwa index [-a bwtsw|is] [-c] \n\n"); - fprintf(stderr, "Options: -a STR BWT construction algorithm: bwtsw or is [auto]\n"); - fprintf(stderr, " -p STR prefix of the index [same as fasta name]\n"); - fprintf(stderr, " -6 index files named as 
.64.* instead of .* \n"); - fprintf(stderr, "\n"); - fprintf(stderr, "Warning: `-a bwtsw' does not work for short genomes, while `-a is' and\n"); - fprintf(stderr, " `-a div' do not work not for long genomes. Please choose `-a'\n"); - fprintf(stderr, " according to the length of the genome.\n\n"); - return 1; - } - if (prefix == 0) { - prefix = malloc(strlen(argv[optind]) + 4); - strcpy(prefix, argv[optind]); - if (is_64) strcat(prefix, ".64"); - } - str = (char*)calloc(strlen(prefix) + 10, 1); - str2 = (char*)calloc(strlen(prefix) + 10, 1); - str3 = (char*)calloc(strlen(prefix) + 10, 1); - - { // nucleotide indexing - gzFile fp = xzopen(argv[optind], "r"); - t = clock(); - fprintf(stderr, "[bwa_index] Pack FASTA... "); - l_pac = bns_fasta2bntseq(fp, prefix, 0); - fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); - err_gzclose(fp); - } - if (algo_type == 0) algo_type = l_pac > 50000000? 2 : 3; // set the algorithm for generating BWT - { - strcpy(str, prefix); strcat(str, ".pac"); - strcpy(str2, prefix); strcat(str2, ".bwt"); - t = clock(); - fprintf(stderr, "[bwa_index] Construct BWT for the packed sequence...\n"); - if (algo_type == 2) bwt_bwtgen(str, str2); - else if (algo_type == 1 || algo_type == 3) { - bwt_t *bwt; - bwt = bwt_pac2bwt(str, algo_type == 3); - bwt_dump_bwt(str2, bwt); - bwt_destroy(bwt); - } - fprintf(stderr, "[bwa_index] %.2f seconds elapse.\n", (float)(clock() - t) / CLOCKS_PER_SEC); - } - { - bwt_t *bwt; - strcpy(str, prefix); strcat(str, ".bwt"); - t = clock(); - fprintf(stderr, "[bwa_index] Update BWT... "); - bwt = bwt_restore_bwt(str); - bwt_bwtupdate_core(bwt); - bwt_dump_bwt(str, bwt); - bwt_destroy(bwt); - fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); - } - { - gzFile fp = xzopen(argv[optind], "r"); - t = clock(); - fprintf(stderr, "[bwa_index] Pack forward-only FASTA... 
"); - l_pac = bns_fasta2bntseq(fp, prefix, 1); - fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); - err_gzclose(fp); - } - { - bwt_t *bwt; - strcpy(str, prefix); strcat(str, ".bwt"); - strcpy(str3, prefix); strcat(str3, ".sa"); - t = clock(); - fprintf(stderr, "[bwa_index] Construct SA from BWT and Occ... "); - bwt = bwt_restore_bwt(str); - bwt_cal_sa(bwt, 32); - bwt_dump_sa(str3, bwt); - bwt_destroy(bwt); - fprintf(stderr, "%.2f sec\n", (float)(clock() - t) / CLOCKS_PER_SEC); - } - free(str3); free(str2); free(str); free(prefix); - return 0; -} diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/bwtsw2.h --- a/bwa-0.7.9a/bwtsw2.h Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,69 +0,0 @@ -#ifndef LH3_BWTSW2_H -#define LH3_BWTSW2_H - -#include -#include "bntseq.h" -#include "bwt_lite.h" -#include "bwt.h" - -#define BSW2_FLAG_MATESW 0x100 -#define BSW2_FLAG_TANDEM 0x200 -#define BSW2_FLAG_MOVED 0x400 -#define BSW2_FLAG_RESCUED 0x800 - -typedef struct { - int skip_sw:8, cpy_cmt:8, hard_clip:16; - int a, b, q, r, t, qr, bw, max_ins, max_chain_gap; - int z, is, t_seeds, multi_2nd; - float mask_level, coef; - int n_threads, chunk_size; -} bsw2opt_t; - -typedef struct { - bwtint_t k, l; - uint32_t flag:18, n_seeds:13, is_rev:1; - int len, G, G2; - int beg, end; -} bsw2hit_t; - -typedef struct { - int flag, nn, n_cigar, chr, pos, qual, mchr, mpos, pqual, isize, nm; - uint32_t *cigar; -} bsw2aux_t; - -typedef struct { - int n, max; - bsw2hit_t *hits; - bsw2aux_t *aux; -} bwtsw2_t; - -typedef struct { - void *stack; - int max_l; - uint8_t *aln_mem; -} bsw2global_t; - -typedef struct { - int l, tid; - char *name, *seq, *qual, *sam, *comment; -} bsw2seq1_t; - -#ifdef __cplusplus -extern "C" { -#endif - - bsw2opt_t *bsw2_init_opt(); - bwtsw2_t **bsw2_core(const bntseq_t *bns, const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *query, bsw2global_t *pool); - void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * 
const target, const char *fn, const char *fn2); - void bsw2_destroy(bwtsw2_t *b); - - bsw2global_t *bsw2_global_init(); - void bsw2_global_destroy(bsw2global_t *_pool); - - void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, bsw2seq1_t *seq, bwtsw2_t **hit); - -#ifdef __cplusplus -} -#endif - -#endif diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/bwtsw2_aux.c --- a/bwa-0.7.9a/bwtsw2_aux.c Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,776 +0,0 @@ -#include -#include -#include -#ifdef HAVE_CONFIG_H -#include "config.h" -#endif -#ifdef HAVE_PTHREAD -#include -#endif -#include "bntseq.h" -#include "bwt_lite.h" -#include "utils.h" -#include "bwtsw2.h" -#include "kstring.h" -#include "bwa.h" -#include "ksw.h" - -#include "kseq.h" -KSEQ_DECLARE(gzFile) - -#include "ksort.h" -#define __left_lt(a, b) ((a).end > (b).end) -KSORT_INIT(hit, bsw2hit_t, __left_lt) - -#ifdef USE_MALLOC_WRAPPERS -# include "malloc_wrap.h" -#endif - - -extern unsigned char nst_nt4_table[256]; - -unsigned char nt_comp_table[256] = { - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','T','V','G', 'H','N','N','C', 'D','N','N','M', 'N','K','N','N', - 'N','N','Y','S', 'A','N','B','W', 'X','R','N','N', 'N','N','N','N', - 'n','t','v','g', 'h','n','n','c', 'd','n','n','m', 'n','k','n','n', - 'n','n','y','s', 'a','n','b','w', 'x','r','n','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 
'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', - 'N','N','N','N', 'N','N','N','N', 'N','N','N','N', 'N','N','N','N' -}; - -extern int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS); -extern int bsw2_resolve_query_overlaps(bwtsw2_t *b, float mask_level); - -bsw2opt_t *bsw2_init_opt() -{ - bsw2opt_t *o = (bsw2opt_t*)calloc(1, sizeof(bsw2opt_t)); - o->a = 1; o->b = 3; o->q = 5; o->r = 2; o->t = 30; - o->bw = 50; - o->max_ins = 20000; - o->z = 1; o->is = 3; o->t_seeds = 5; o->hard_clip = 0; o->skip_sw = 0; - o->mask_level = 0.50f; o->coef = 5.5f; - o->qr = o->q + o->r; o->n_threads = 1; o->chunk_size = 10000000; - o->max_chain_gap = 10000; - o->cpy_cmt = 0; - return o; -} - -void bsw2_destroy(bwtsw2_t *b) -{ - int i; - if (b == 0) return; - if (b->aux) - for (i = 0; i < b->n; ++i) free(b->aux[i].cigar); - free(b->aux); free(b->hits); - free(b); -} - -bwtsw2_t *bsw2_dup_no_cigar(const bwtsw2_t *b) -{ - bwtsw2_t *p; - p = calloc(1, sizeof(bwtsw2_t)); - p->max = p->n = b->n; - if (b->n) { - kroundup32(p->max); - p->hits = calloc(p->max, sizeof(bsw2hit_t)); - memcpy(p->hits, b->hits, p->n * sizeof(bsw2hit_t)); - } - return p; -} - -#define __gen_ap(par, opt) do { \ - int i; \ - for (i = 0; i < 25; ++i) (par).matrix[i] = -(opt)->b; \ - for (i = 0; i < 4; ++i) (par).matrix[i*5+i] = (opt)->a; \ - (par).gap_open = (opt)->q; (par).gap_ext = (opt)->r; \ - (par).gap_end = (opt)->r; \ - (par).row = 5; (par).band_width = opt->bw; \ - } while (0) - -void bsw2_extend_left(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *_query, int lq, uint8_t *pac, bwtint_t l_pac, uint8_t *_mem) -{ - int i; - bwtint_t k; - uint8_t *target = 0, *query; - int8_t mat[25]; - - bwa_fill_scmat(opt->a, opt->b, mat); - query = calloc(lq, 1); - // sort according to the descending order of query end - ks_introsort(hit, b->n, b->hits); - target = calloc(((lq + 1) / 2 * opt->a + opt->r) / 
opt->r + lq, 1); - // reverse _query - for (i = 0; i < lq; ++i) query[lq - i - 1] = _query[i]; - // core loop - for (i = 0; i < b->n; ++i) { - bsw2hit_t *p = b->hits + i; - int lt = ((p->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq; - int score, j, qle, tle; - p->n_seeds = 1; - if (p->l || p->k == 0) continue; - for (j = score = 0; j < i; ++j) { - bsw2hit_t *q = b->hits + j; - if (q->beg <= p->beg && q->k <= p->k && q->k + q->len >= p->k + p->len) { - if (q->n_seeds < (1<<13) - 2) ++q->n_seeds; - ++score; - } - } - if (score) continue; - if (lt > p->k) lt = p->k; - for (k = p->k - 1, j = 0; k > 0 && j < lt; --k) // FIXME: k=0 not considered! - target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; - lt = j; - score = ksw_extend(p->beg, &query[lq - p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, 0, -1, p->G, &qle, &tle, 0, 0, 0); - if (score > p->G) { // extensible - p->G = score; - p->k -= tle; - p->len += tle; - p->beg -= qle; - } - } - free(query); free(target); -} - -void bsw2_extend_rght(const bsw2opt_t *opt, bwtsw2_t *b, uint8_t *query, int lq, uint8_t *pac, bwtint_t l_pac, uint8_t *_mem) -{ - int i; - bwtint_t k; - uint8_t *target; - int8_t mat[25]; - - bwa_fill_scmat(opt->a, opt->b, mat); - target = calloc(((lq + 1) / 2 * opt->a + opt->r) / opt->r + lq, 1); - for (i = 0; i < b->n; ++i) { - bsw2hit_t *p = b->hits + i; - int lt = ((lq - p->beg + 1) / 2 * opt->a + opt->r) / opt->r + lq; - int j, score, qle, tle; - if (p->l) continue; - for (k = p->k, j = 0; k < p->k + lt && k < l_pac; ++k) - target[j++] = pac[k>>2] >> (~k&3)*2 & 0x3; - lt = j; - score = ksw_extend(lq - p->beg, &query[p->beg], lt, target, 5, mat, opt->q, opt->r, opt->bw, 0, -1, 1, &qle, &tle, 0, 0, 0) - 1; -// if (score < p->G) fprintf(stderr, "[bsw2_extend_hits] %d < %d\n", score, p->G); - if (score >= p->G) { - p->G = score; - p->len = tle; - p->end = p->beg + qle; - } - } - free(target); -} - -/* generate CIGAR array(s) in b->cigar[] */ -static void gen_cigar(const bsw2opt_t *opt, int lq, uint8_t 
*seq[2], int64_t l_pac, const uint8_t *pac, bwtsw2_t *b, const char *name) -{ - int i; - int8_t mat[25]; - - bwa_fill_scmat(opt->a, opt->b, mat); - for (i = 0; i < b->n; ++i) { - bsw2hit_t *p = b->hits + i; - bsw2aux_t *q = b->aux + i; - uint8_t *query; - int beg, end, score; - if (p->l) continue; - beg = (p->flag & 0x10)? lq - p->end : p->beg; - end = (p->flag & 0x10)? lq - p->beg : p->end; - query = seq[(p->flag & 0x10)? 1 : 0] + beg; - q->cigar = bwa_gen_cigar(mat, opt->q, opt->r, opt->bw, l_pac, pac, end - beg, query, p->k, p->k + p->len, &score, &q->n_cigar, &q->nm); -#if 0 - if (name && score != p->G) { // debugging only - int j, glen = 0; - for (j = 0; j < q->n_cigar; ++j) - if ((q->cigar[j]&0xf) == 1 || (q->cigar[j]&0xf) == 2) - glen += q->cigar[j]>>4; - fprintf(stderr, "[E::%s] %s - unequal score: %d != %d; (qlen, aqlen, arlen, glen, bw) = (%d, %d, %d, %d, %d)\n", - __func__, name, score, p->G, lq, end - beg, p->len, glen, opt->bw); - } -#endif - if (q->cigar && (beg != 0 || end < lq)) { // write soft clipping - q->cigar = realloc(q->cigar, 4 * (q->n_cigar + 2)); - if (beg != 0) { - memmove(q->cigar + 1, q->cigar, q->n_cigar * 4); - q->cigar[0] = beg<<4 | 4; - ++q->n_cigar; - } - if (end < lq) { - q->cigar[q->n_cigar] = (lq - end)<<4 | 4; - ++q->n_cigar; - } - } - } -} - -/* this is for the debugging purpose only */ -void bsw2_debug_hits(const bwtsw2_t *b) -{ - int i; - printf("# raw hits: %d\n", b->n); - for (i = 0; i < b->n; ++i) { - bsw2hit_t *p = b->hits + i; - if (p->G > 0) - printf("G=%d, G2=%d, len=%d, [%d,%d), k=%lu, l=%lu, #seeds=%d, is_rev=%d\n", p->G, p->G2, p->len, p->beg, p->end, (long)p->k, (long)p->l, p->n_seeds, p->is_rev); - } -} - -static void merge_hits(bwtsw2_t *b[2], int l, int is_reverse) -{ - int i; - if (b[0]->n + b[1]->n > b[0]->max) { - b[0]->max = b[0]->n + b[1]->n; - b[0]->hits = realloc(b[0]->hits, b[0]->max * sizeof(bsw2hit_t)); - } - for (i = 0; i < b[1]->n; ++i) { - bsw2hit_t *p = b[0]->hits + b[0]->n + i; - *p = 
b[1]->hits[i]; - if (is_reverse) { - int x = p->beg; - p->beg = l - p->end; - p->end = l - x; - p->flag |= 0x10; - } - } - b[0]->n += b[1]->n; - bsw2_destroy(b[1]); - b[1] = 0; -} -/* seq[0] is the forward sequence and seq[1] is the reverse complement. */ -static bwtsw2_t *bsw2_aln1_core(const bsw2opt_t *opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target, - int l, uint8_t *seq[2], bsw2global_t *pool) -{ - extern void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2]); - bwtsw2_t *b[2], **bb[2], **_b, *p; - int k, j; - bwtl_t *query; - query = bwtl_seq2bwtl(l, seq[0]); - _b = bsw2_core(bns, opt, query, target, pool); - bwtl_destroy(query); - for (k = 0; k < 2; ++k) { - bb[k] = calloc(2, sizeof(void*)); - bb[k][0] = calloc(1, sizeof(bwtsw2_t)); - bb[k][1] = calloc(1, sizeof(bwtsw2_t)); - } - for (k = 0; k < 2; ++k) { // separate _b into bb[2] based on the strand - for (j = 0; j < _b[k]->n; ++j) { - bsw2hit_t *q; - p = bb[_b[k]->hits[j].is_rev][k]; - if (p->n == p->max) { - p->max = p->max? 
p->max<<1 : 8; - p->hits = realloc(p->hits, p->max * sizeof(bsw2hit_t)); - } - q = &p->hits[p->n++]; - *q = _b[k]->hits[j]; - if (_b[k]->hits[j].is_rev) { - int x = q->beg; - q->beg = l - q->end; - q->end = l - x; - } - } - } - b[0] = bb[0][1]; b[1] = bb[1][1]; // bb[*][1] are "narrow SA hits" - bsw2_chain_filter(opt, l, b); // NB: only unique seeds are chained - for (k = 0; k < 2; ++k) { - bsw2_extend_left(opt, bb[k][1], seq[k], l, pac, bns->l_pac, pool->aln_mem); - merge_hits(bb[k], l, 0); // bb[k][1] is merged to bb[k][0] here - bsw2_resolve_duphits(0, 0, bb[k][0], 0); - bsw2_extend_rght(opt, bb[k][0], seq[k], l, pac, bns->l_pac, pool->aln_mem); - bsw2_resolve_duphits(0, 0, bb[k][0], 0); - b[k] = bb[k][0]; - free(bb[k]); - } - merge_hits(b, l, 1); // again, b[1] is merged to b[0] - bsw2_resolve_query_overlaps(b[0], opt->mask_level); - bsw2_destroy(_b[0]); bsw2_destroy(_b[1]); free(_b); - return b[0]; -} - -/* set ->flag to records the origin of the hit (to forward bwt or reverse bwt) */ -static void flag_fr(bwtsw2_t *b[2]) -{ - int i, j; - for (i = 0; i < b[0]->n; ++i) { - bsw2hit_t *p = b[0]->hits + i; - p->flag |= 0x10000; - } - for (i = 0; i < b[1]->n; ++i) { - bsw2hit_t *p = b[1]->hits + i; - p->flag |= 0x20000; - } - for (i = 0; i < b[0]->n; ++i) { - bsw2hit_t *p = b[0]->hits + i; - for (j = 0; j < b[1]->n; ++j) { - bsw2hit_t *q = b[1]->hits + j; - if (q->beg == p->beg && q->end == p->end && q->k == p->k && q->len == p->len && q->G == p->G) { - q->flag |= 0x30000; p->flag |= 0x30000; - break; - } - } - } -} - -typedef struct { - int n, max; - bsw2seq1_t *seq; -} bsw2seq_t; - -static int fix_cigar(const bntseq_t *bns, bsw2hit_t *p, int n_cigar, uint32_t *cigar) -{ - // FIXME: this routine does not work if the query bridge three reference sequences - int32_t coor, refl, lq; - int x, y, i, seqid; - bns_cnt_ambi(bns, p->k, p->len, &seqid); - coor = p->k - bns->anns[seqid].offset; - refl = bns->anns[seqid].len; - x = coor; y = 0; - // test if the alignment goes 
beyond the boundary - for (i = 0; i < n_cigar; ++i) { - int op = cigar[i]&0xf, ln = cigar[i]>>4; - if (op == 1 || op == 4 || op == 5) y += ln; - else if (op == 2) x += ln; - else x += ln, y += ln; - } - lq = y; // length of the query sequence - if (x > refl) { // then fix it - int j, nc, mq[2], nlen[2]; - uint32_t *cn; - bwtint_t kk = 0; - nc = mq[0] = mq[1] = nlen[0] = nlen[1] = 0; - cn = calloc(n_cigar + 3, 4); - x = coor; y = 0; - for (i = j = 0; i < n_cigar; ++i) { - int op = cigar[i]&0xf, ln = cigar[i]>>4; - if (op == 4 || op == 5 || op == 1) { // ins or clipping - y += ln; - cn[j++] = cigar[i]; - } else if (op == 2) { // del - if (x + ln >= refl && nc == 0) { - cn[j++] = (uint32_t)(lq - y)<<4 | 4; - nc = j; - cn[j++] = (uint32_t)y<<4 | 4; - kk = p->k + (x + ln - refl); - nlen[0] = x - coor; - nlen[1] = p->len - nlen[0] - ln; - } else cn[j++] = cigar[i]; - x += ln; - } else if (op == 0) { // match - if (x + ln >= refl && nc == 0) { - // FIXME: not consider a special case where a split right between M and I - cn[j++] = (uint32_t)(refl - x)<<4 | 0; // write M - cn[j++] = (uint32_t)(lq - y - (refl - x))<<4 | 4; // write S - nc = j; - mq[0] += refl - x; - cn[j++] = (uint32_t)(y + (refl - x))<<4 | 4; - if (x + ln - refl) cn[j++] = (uint32_t)(x + ln - refl)<<4 | 0; - mq[1] += x + ln - refl; - kk = bns->anns[seqid].offset + refl; - nlen[0] = refl - coor; - nlen[1] = p->len - nlen[0]; - } else { - cn[j++] = cigar[i]; - mq[nc?1:0] += ln; - } - x += ln; y += ln; - } - } - if (mq[0] > mq[1]) { // then take the first alignment - n_cigar = nc; - memcpy(cigar, cn, 4 * nc); - p->len = nlen[0]; - } else { - p->k = kk; p->len = nlen[1]; - n_cigar = j - nc; - memcpy(cigar, cn + nc, 4 * (j - nc)); - } - free(cn); - } - return n_cigar; -} - -static void write_aux(const bsw2opt_t *opt, const bntseq_t *bns, int qlen, uint8_t *seq[2], const uint8_t *pac, bwtsw2_t *b, const char *name) -{ - int i; - // allocate for b->aux - if (b->n<<1 < b->max) { - b->max = b->n; - 
kroundup32(b->max); - b->hits = realloc(b->hits, b->max * sizeof(bsw2hit_t)); - } - b->aux = calloc(b->n, sizeof(bsw2aux_t)); - // generate CIGAR - gen_cigar(opt, qlen, seq, bns->l_pac, pac, b, name); - // fix CIGAR, generate mapQ, and write chromosomal position - for (i = 0; i < b->n; ++i) { - bsw2hit_t *p = &b->hits[i]; - bsw2aux_t *q = &b->aux[i]; - q->flag = p->flag & 0xfe; - q->isize = 0; - if (p->l == 0) { // unique hit - float c = 1.0; - int subo; - // fix out-of-boundary CIGAR - q->n_cigar = fix_cigar(bns, p, q->n_cigar, q->cigar); - // compute mapQ - subo = p->G2 > opt->t? p->G2 : opt->t; - if (p->flag>>16 == 1 || p->flag>>16 == 2) c *= .5; - if (p->n_seeds < 2) c *= .2; - q->qual = (int)(c * (p->G - subo) * (250.0 / p->G + 0.03 / opt->a) + .499); - if (q->qual > 250) q->qual = 250; - if (q->qual < 0) q->qual = 0; - if (p->flag&1) q->qual = 0; // this is a random hit - q->pqual = q->qual; // set the paired qual as qual - // get the chromosomal position - q->nn = bns_cnt_ambi(bns, p->k, p->len, &q->chr); - q->pos = p->k - bns->anns[q->chr].offset; - } else q->qual = 0, q->n_cigar = 0, q->chr = q->pos = -1, q->nn = 0; - } -} - -static void update_mate_aux(bwtsw2_t *b, const bwtsw2_t *m) -{ - int i; - if (m == 0) return; - // update flag, mchr and mpos - for (i = 0; i < b->n; ++i) { - bsw2aux_t *q = &b->aux[i]; - q->flag |= 1; // paired - if (m->n == 0) q->flag |= 8; // mate unmapped - if (m->n == 1) { - q->mchr = m->aux[0].chr; - q->mpos = m->aux[0].pos; - if (m->aux[0].flag&0x10) q->flag |= 0x20; // mate reverse strand - if (q->chr == q->mchr) { // set insert size - if (q->mpos + m->hits[0].len > q->pos) - q->isize = q->mpos + m->hits[0].len - q->pos; - else q->isize = q->mpos - q->pos - b->hits[0].len; - } else q->isize = 0; - } else q->mchr = q->mpos = -1; - } - // update mapping quality - if (b->n == 1 && m->n == 1) { - bsw2hit_t *p = &b->hits[0]; - if (p->flag & BSW2_FLAG_MATESW) { // this alignment is found by Smith-Waterman - if (!(p->flag & 
BSW2_FLAG_TANDEM) && b->aux[0].pqual < 20) - b->aux[0].pqual = 20; - if (b->aux[0].pqual >= m->aux[0].qual) b->aux[0].pqual = m->aux[0].qual; - } else if ((p->flag & 2) && !(m->hits[0].flag & BSW2_FLAG_MATESW)) { // properly paired - if (!(p->flag & BSW2_FLAG_TANDEM)) { // pqual is bounded by [b->aux[0].qual,m->aux[0].qual] - b->aux[0].pqual += 20; - if (b->aux[0].pqual > m->aux[0].qual) b->aux[0].pqual = m->aux[0].qual; - if (b->aux[0].pqual < b->aux[0].qual) b->aux[0].pqual = b->aux[0].qual; - } - } - } -} - -/* generate SAM lines for a sequence in ks with alignment stored in - * b. ks->name and ks->seq will be freed and set to NULL in the end. */ -static void print_hits(const bntseq_t *bns, const bsw2opt_t *opt, bsw2seq1_t *ks, bwtsw2_t *b, int is_pe, bwtsw2_t *bmate) -{ - int i, k; - kstring_t str; - memset(&str, 0, sizeof(kstring_t)); - if (b == 0 || b->n == 0) { // no hits - ksprintf(&str, "%s\t4\t*\t0\t0\t*\t*\t0\t0\t", ks->name); - for (i = 0; i < ks->l; ++i) kputc(ks->seq[i], &str); - if (ks->qual) { - kputc('\t', &str); - for (i = 0; i < ks->l; ++i) kputc(ks->qual[i], &str); - } else kputs("\t*", &str); - kputc('\n', &str); - } - for (i = 0; b && i < b->n; ++i) { - bsw2hit_t *p = b->hits + i; - bsw2aux_t *q = b->aux + i; - int j, beg, end, type = 0; - // print mandatory fields before SEQ - if (q->cigar == 0) q->flag |= 0x4; - ksprintf(&str, "%s\t%d", ks->name, q->flag | (opt->multi_2nd && i? 0x100 : 0)); - ksprintf(&str, "\t%s\t%ld", q->chr>=0? bns->anns[q->chr].name : "*", (long)q->pos + 1); - if (p->l == 0 && q->cigar) { // not a repetitive hit - ksprintf(&str, "\t%d\t", q->pqual); - for (k = 0; k < q->n_cigar; ++k) - ksprintf(&str, "%d%c", q->cigar[k]>>4, (opt->hard_clip? "MIDNHHP" : "MIDNSHP")[q->cigar[k]&0xf]); - } else ksprintf(&str, "\t0\t*"); - if (!is_pe) kputs("\t*\t0\t0\t", &str); - else ksprintf(&str, "\t%s\t%d\t%d\t", q->mchr==q->chr? "=" : (q->mchr<0? 
"*" : bns->anns[q->mchr].name), q->mpos+1, q->isize); - // get the sequence begin and end - beg = 0; end = ks->l; - if (opt->hard_clip && q->cigar) { - if ((q->cigar[0]&0xf) == 4) beg += q->cigar[0]>>4; - if ((q->cigar[q->n_cigar-1]&0xf) == 4) end -= q->cigar[q->n_cigar-1]>>4; - } - for (j = beg; j < end; ++j) { - if (p->flag&0x10) kputc(nt_comp_table[(int)ks->seq[ks->l - 1 - j]], &str); - else kputc(ks->seq[j], &str); - } - // print base quality if present - if (ks->qual) { - kputc('\t', &str); - for (j = beg; j < end; ++j) { - if (p->flag&0x10) kputc(ks->qual[ks->l - 1 - j], &str); - else kputc(ks->qual[j], &str); - } - } else kputs("\t*", &str); - // print optional tags - ksprintf(&str, "\tAS:i:%d\tXS:i:%d\tXF:i:%d\tXE:i:%d\tNM:i:%d", p->G, p->G2, p->flag>>16, p->n_seeds, q->nm); - if (q->nn) ksprintf(&str, "\tXN:i:%d", q->nn); - if (p->l) ksprintf(&str, "\tXI:i:%d", p->l - p->k + 1); - if (p->flag&BSW2_FLAG_MATESW) type |= 1; - if (p->flag&BSW2_FLAG_TANDEM) type |= 2; - if (type) ksprintf(&str, "\tXT:i:%d", type); - if (opt->cpy_cmt && ks->comment) { - int l = strlen(ks->comment); - if (l >= 6 && ks->comment[2] == ':' && ks->comment[4] == ':') { - kputc('\t', &str); kputs(ks->comment, &str); - } - } - kputc('\n', &str); - } - ks->sam = str.s; - free(ks->seq); ks->seq = 0; - free(ks->qual); ks->qual = 0; - free(ks->name); ks->name = 0; -} - -static void update_opt(bsw2opt_t *dst, const bsw2opt_t *src, int qlen) -{ - double ll = log(qlen); - int i, k; - *dst = *src; - if (dst->t < ll * dst->coef) dst->t = (int)(ll * dst->coef + .499); - // set band width: the query length sets a boundary on the maximum band width - k = (qlen * dst->a - 2 * dst->q) / (2 * dst->r + dst->a); - i = (qlen * dst->a - dst->a - dst->t) / dst->r; - if (k > i) k = i; - if (k < 1) k = 1; // I do not know if k==0 causes troubles - dst->bw = src->bw < k? src->bw : k; -} - -/* Core routine to align reads in _seq. 
It is separated from - * process_seqs() to realize multi-threading */ -static void bsw2_aln_core(bsw2seq_t *_seq, const bsw2opt_t *_opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target, int is_pe) -{ - int x; - bsw2opt_t opt; - bsw2global_t *pool = bsw2_global_init(); - bwtsw2_t **buf; - buf = calloc(_seq->n, sizeof(void*)); - for (x = 0; x < _seq->n; ++x) { - bsw2seq1_t *p = _seq->seq + x; - uint8_t *seq[2], *rseq[2]; - int i, l, k; - bwtsw2_t *b[2]; - l = p->l; - update_opt(&opt, _opt, p->l); - if (pool->max_l < l) { // then enlarge working space for aln_extend_core() - int tmp = ((l + 1) / 2 * opt.a + opt.r) / opt.r + l; - pool->max_l = l; - pool->aln_mem = realloc(pool->aln_mem, (tmp + 2) * 24); - } - // set seq[2] and rseq[2] - seq[0] = calloc(l * 4, 1); - seq[1] = seq[0] + l; - rseq[0] = seq[1] + l; rseq[1] = rseq[0] + l; - // convert sequences to 2-bit representation - for (i = k = 0; i < l; ++i) { - int c = nst_nt4_table[(int)p->seq[i]]; - if (c >= 4) { c = (int)(drand48() * 4); ++k; } // FIXME: ambiguous bases are not properly handled - seq[0][i] = c; - seq[1][l-1-i] = 3 - c; - rseq[0][l-1-i] = 3 - c; - rseq[1][i] = c; - } - if (l - k < opt.t) { // too few unambiguous bases - buf[x] = calloc(1, sizeof(bwtsw2_t)); - free(seq[0]); continue; - } - // alignment - b[0] = bsw2_aln1_core(&opt, bns, pac, target, l, seq, pool); - for (k = 0; k < b[0]->n; ++k) - if (b[0]->hits[k].n_seeds < opt.t_seeds) break; - if (k < b[0]->n) { - b[1] = bsw2_aln1_core(&opt, bns, pac, target, l, rseq, pool); - for (i = 0; i < b[1]->n; ++i) { - bsw2hit_t *p = &b[1]->hits[i]; - int x = p->beg; - p->flag ^= 0x10, p->is_rev ^= 1; // flip the strand - p->beg = l - p->end; - p->end = l - x; - } - flag_fr(b); - merge_hits(b, l, 0); - bsw2_resolve_duphits(0, 0, b[0], 0); - bsw2_resolve_query_overlaps(b[0], opt.mask_level); - } else b[1] = 0; - // generate CIGAR and print SAM - buf[x] = bsw2_dup_no_cigar(b[0]); - // free - free(seq[0]); - bsw2_destroy(b[0]); - } - if (is_pe) 
bsw2_pair(&opt, bns->l_pac, pac, _seq->n, _seq->seq, buf); - for (x = 0; x < _seq->n; ++x) { - bsw2seq1_t *p = _seq->seq + x; - uint8_t *seq[2]; - int i; - seq[0] = malloc(p->l * 2); seq[1] = seq[0] + p->l; - for (i = 0; i < p->l; ++i) { - int c = nst_nt4_table[(int)p->seq[i]]; - if (c >= 4) c = (int)(drand48() * 4); - seq[0][i] = c; - seq[1][p->l-1-i] = 3 - c; - } - update_opt(&opt, _opt, p->l); - write_aux(&opt, bns, p->l, seq, pac, buf[x], _seq->seq[x].name); - free(seq[0]); - } - for (x = 0; x < _seq->n; ++x) { - if (is_pe) update_mate_aux(buf[x], buf[x^1]); - print_hits(bns, &opt, &_seq->seq[x], buf[x], is_pe, buf[x^1]); - } - for (x = 0; x < _seq->n; ++x) bsw2_destroy(buf[x]); - free(buf); - bsw2_global_destroy(pool); -} - -#ifdef HAVE_PTHREAD -typedef struct { - int tid, is_pe; - bsw2seq_t *_seq; - const bsw2opt_t *_opt; - const bntseq_t *bns; - uint8_t *pac; - const bwt_t *target; -} thread_aux_t; - -/* another interface to bsw2_aln_core() to facilitate pthread_create() */ -static void *worker(void *data) -{ - thread_aux_t *p = (thread_aux_t*)data; - bsw2_aln_core(p->_seq, p->_opt, p->bns, p->pac, p->target, p->is_pe); - return 0; -} -#endif - -/* process sequences stored in _seq, generate SAM lines for these - * sequences and reset _seq afterwards. */ -static void process_seqs(bsw2seq_t *_seq, const bsw2opt_t *opt, const bntseq_t *bns, uint8_t *pac, const bwt_t *target, int is_pe) -{ - int i; - is_pe = is_pe? 
1 : 0; - -#ifdef HAVE_PTHREAD - if (opt->n_threads <= 1) { - bsw2_aln_core(_seq, opt, bns, pac, target, is_pe); - } else { - pthread_t *tid; - pthread_attr_t attr; - thread_aux_t *data; - int j; - pthread_attr_init(&attr); - pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE); - data = (thread_aux_t*)calloc(opt->n_threads, sizeof(thread_aux_t)); - tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); - for (j = 0; j < opt->n_threads; ++j) { - thread_aux_t *p = data + j; - p->tid = j; p->_opt = opt; p->bns = bns; p->is_pe = is_pe; - p->pac = pac; p->target = target; - p->_seq = calloc(1, sizeof(bsw2seq_t)); - p->_seq->max = (_seq->n + opt->n_threads - 1) / opt->n_threads + 1; - p->_seq->n = 0; - p->_seq->seq = calloc(p->_seq->max, sizeof(bsw2seq1_t)); - } - for (i = 0; i < _seq->n; ++i) { // assign sequences to each thread - bsw2seq_t *p = data[(i>>is_pe)%opt->n_threads]._seq; - p->seq[p->n++] = _seq->seq[i]; - } - for (j = 0; j < opt->n_threads; ++j) pthread_create(&tid[j], &attr, worker, &data[j]); - for (j = 0; j < opt->n_threads; ++j) pthread_join(tid[j], 0); - for (j = 0; j < opt->n_threads; ++j) data[j]._seq->n = 0; - for (i = 0; i < _seq->n; ++i) { // copy the result from each thread back - bsw2seq_t *p = data[(i>>is_pe)%opt->n_threads]._seq; - _seq->seq[i] = p->seq[p->n++]; - } - for (j = 0; j < opt->n_threads; ++j) { - thread_aux_t *p = data + j; - free(p->_seq->seq); - free(p->_seq); - } - free(data); free(tid); - } -#else - bsw2_aln_core(_seq, opt, bns, pac, target, is_pe); -#endif - - // print and reset - for (i = 0; i < _seq->n; ++i) { - bsw2seq1_t *p = _seq->seq + i; - if (p->sam) err_printf("%s", p->sam); - free(p->name); free(p->seq); free(p->qual); free(p->sam); - p->tid = -1; p->l = 0; - p->name = p->seq = p->qual = p->sam = 0; - } - err_fflush(stdout); - _seq->n = 0; -} - -void bsw2_aln(const bsw2opt_t *opt, const bntseq_t *bns, bwt_t * const target, const char *fn, const char *fn2) -{ - gzFile fp, fp2; - kseq_t *ks, *ks2; - int l, 
is_pe = 0, i, n; - uint8_t *pac; - bsw2seq_t *_seq; - bseq1_t *bseq; - - pac = calloc(bns->l_pac/4+1, 1); - for (l = 0; l < bns->n_seqs; ++l) - err_printf("@SQ\tSN:%s\tLN:%d\n", bns->anns[l].name, bns->anns[l].len); - err_fread_noeof(pac, 1, bns->l_pac/4+1, bns->fp_pac); - fp = xzopen(fn, "r"); - ks = kseq_init(fp); - _seq = calloc(1, sizeof(bsw2seq_t)); - if (fn2) { - fp2 = xzopen(fn2, "r"); - ks2 = kseq_init(fp2); - is_pe = 1; - } else fp2 = 0, ks2 = 0, is_pe = 0; - while ((bseq = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) { - int size = 0; - if (n > _seq->max) { - _seq->max = n; - kroundup32(_seq->max); - _seq->seq = realloc(_seq->seq, _seq->max * sizeof(bsw2seq1_t)); - } - _seq->n = n; - for (i = 0; i < n; ++i) { - bseq1_t *b = &bseq[i]; - bsw2seq1_t *p = &_seq->seq[i]; - p->tid = -1; p->l = b->l_seq; - p->name = b->name; p->seq = b->seq; p->qual = b->qual; p->comment = b->comment; p->sam = 0; - size += p->l; - } - fprintf(stderr, "[bsw2_aln] read %d sequences/pairs (%d bp) ...\n", n, size); - free(bseq); - process_seqs(_seq, opt, bns, pac, target, is_pe); - } - // free - free(pac); - free(_seq->seq); free(_seq); - kseq_destroy(ks); - err_gzclose(fp); - if (fn2) { - kseq_destroy(ks2); - err_gzclose(fp2); - } -} diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/bwtsw2_chain.c --- a/bwa-0.7.9a/bwtsw2_chain.c Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,112 +0,0 @@ -#include -#include "bwtsw2.h" - -#ifdef USE_MALLOC_WRAPPERS -# include "malloc_wrap.h" -#endif - -typedef struct { - uint32_t tbeg, tend; - int qbeg, qend; - uint32_t flag:1, idx:31; - int chain; // also reuse as a counter -} hsaip_t; - -#define _hsaip_lt(a, b) ((a).qbeg < (b).qbeg) - -#include "ksort.h" -KSORT_INIT(hsaip, hsaip_t, _hsaip_lt) - -static int chaining(const bsw2opt_t *opt, int shift, int n, hsaip_t *z, hsaip_t *chain) -{ - int j, k, m = 0; - ks_introsort(hsaip, n, z); - for (j = 0; j < n; ++j) { - hsaip_t *p = z + j; - for (k = m - 
1; k >= 0; --k) { - hsaip_t *q = chain + k; - int x = p->qbeg - q->qbeg; // always positive - int y = p->tbeg - q->tbeg; - if (y > 0 && x < opt->max_chain_gap && y < opt->max_chain_gap && x - y <= opt->bw && y - x <= opt->bw) { // chained - if (p->qend > q->qend) q->qend = p->qend; - if (p->tend > q->tend) q->tend = p->tend; - ++q->chain; - p->chain = shift + k; - break; - } else if (q->chain > opt->t_seeds * 2) k = 0; // if the chain is strong enough, do not check the previous chains - } - if (k < 0) { // not added to any previous chains - chain[m] = *p; - chain[m].chain = 1; - chain[m].idx = p->chain = shift + m; - ++m; - } - } - return m; -} - -void bsw2_chain_filter(const bsw2opt_t *opt, int len, bwtsw2_t *b[2]) -{ - hsaip_t *z[2], *chain[2]; - int i, j, k, n[2], m[2], thres = opt->t_seeds * 2; - char *flag; - // initialization - n[0] = b[0]->n; n[1] = b[1]->n; - z[0] = calloc(n[0] + n[1], sizeof(hsaip_t)); - z[1] = z[0] + n[0]; - chain[0] = calloc(n[0] + n[1], sizeof(hsaip_t)); - for (k = j = 0; k < 2; ++k) { - for (i = 0; i < b[k]->n; ++i) { - bsw2hit_t *p = b[k]->hits + i; - hsaip_t *q = z[k] + i; - q->flag = k; q->idx = i; - q->tbeg = p->k; q->tend = p->k + p->len; - q->chain = -1; - q->qbeg = p->beg; q->qend = p->end; - } - } - // chaining - m[0] = chaining(opt, 0, n[0], z[0], chain[0]); - chain[1] = chain[0] + m[0]; - m[1] = chaining(opt, m[0], n[1], z[1], chain[1]); - // change query coordinate on the reverse strand - for (k = 0; k < m[1]; ++k) { - hsaip_t *p = chain[1] + k; - int tmp = p->qbeg; - p->qbeg = len - p->qend; p->qend = len - tmp; - } - //for (k = 0; k < m[0]; ++k) printf("%d, [%d,%d), [%d,%d)\n", chain[0][k].chain, chain[0][k].tbeg, chain[0][k].tend, chain[0][k].qbeg, chain[0][k].qend); - // filtering - flag = calloc(m[0] + m[1], 1); - ks_introsort(hsaip, m[0] + m[1], chain[0]); - for (k = 1; k < m[0] + m[1]; ++k) { - hsaip_t *p = chain[0] + k; - for (j = 0; j < k; ++j) { - hsaip_t *q = chain[0] + j; - if (flag[q->idx]) continue; - if 
(q->qend >= p->qend && q->chain > p->chain * thres && p->chain < thres) { - flag[p->idx] = 1; - break; - } - } - } - for (k = 0; k < n[0] + n[1]; ++k) { - hsaip_t *p = z[0] + k; - if (flag[p->chain]) - b[p->flag]->hits[p->idx].G = 0; - } - free(flag); - // squeeze out filtered elements in b[2] - for (k = 0; k < 2; ++k) { - for (j = i = 0; j < n[k]; ++j) { - bsw2hit_t *p = b[k]->hits + j; - if (p->G) { - if (i != j) b[k]->hits[i++] = *p; - else ++i; - } - } - b[k]->n = i; - } - // free - free(z[0]); free(chain[0]); -} diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/bwtsw2_core.c --- a/bwa-0.7.9a/bwtsw2_core.c Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,619 +0,0 @@ -#include -#include -#include -#include -#include -#include "bwt_lite.h" -#include "bwtsw2.h" -#include "bwt.h" -#include "kvec.h" - -#ifdef USE_MALLOC_WRAPPERS -# include "malloc_wrap.h" -#endif - -typedef struct { - bwtint_t k, l; -} qintv_t; - -#define qintv_eq(a, b) ((a).k == (b).k && (a).l == (b).l) -#define qintv_hash(a) ((a).k>>7^(a).l<<17) - -#include "khash.h" -KHASH_INIT(qintv, qintv_t, uint64_t, 1, qintv_hash, qintv_eq) -KHASH_MAP_INIT_INT64(64, uint64_t) - -#define MINUS_INF -0x3fffffff -#define MASK_LEVEL 0.90f - -struct __mempool_t; -static void mp_destroy(struct __mempool_t*); -typedef struct { - bwtint_t qk, ql; - int I, D, G; - uint32_t pj:2, qlen:30; - int tlen; - int ppos, upos; - int cpos[4]; -} bsw2cell_t; - -#include "ksort.h" -KSORT_INIT_GENERIC(int) -#define __hitG_lt(a, b) (((a).G + ((int)(a).n_seeds<<2)) > (b).G + ((int)(b).n_seeds<<2)) -KSORT_INIT(hitG, bsw2hit_t, __hitG_lt) - -static const bsw2cell_t g_default_cell = { 0, 0, MINUS_INF, MINUS_INF, MINUS_INF, 0, 0, 0, -1, -1, {-1, -1, -1, -1} }; - -typedef struct { - int n, max; - uint32_t tk, tl; // this is fine - bsw2cell_t *array; -} bsw2entry_t, *bsw2entry_p; - -/* --- BEGIN: Stack operations --- */ -typedef struct { - int n_pending; - kvec_t(bsw2entry_p) stack0, pending; - struct 
__mempool_t *pool; -} bsw2stack_t; - -#define stack_isempty(s) (kv_size(s->stack0) == 0 && s->n_pending == 0) -static void stack_destroy(bsw2stack_t *s) { mp_destroy(s->pool); kv_destroy(s->stack0); kv_destroy(s->pending); free(s); } -inline static void stack_push0(bsw2stack_t *s, bsw2entry_p e) { kv_push(bsw2entry_p, s->stack0, e); } -inline static bsw2entry_p stack_pop(bsw2stack_t *s) -{ - assert(!(kv_size(s->stack0) == 0 && s->n_pending != 0)); - return kv_pop(s->stack0); -} -/* --- END: Stack operations --- */ - -/* --- BEGIN: memory pool --- */ -typedef struct __mempool_t { - int cnt; // if cnt!=0, then there must be memory leak - kvec_t(bsw2entry_p) pool; -} mempool_t; -inline static bsw2entry_p mp_alloc(mempool_t *mp) -{ - ++mp->cnt; - if (kv_size(mp->pool) == 0) return (bsw2entry_t*)calloc(1, sizeof(bsw2entry_t)); - else return kv_pop(mp->pool); -} -inline static void mp_free(mempool_t *mp, bsw2entry_p e) -{ - --mp->cnt; e->n = 0; - kv_push(bsw2entry_p, mp->pool, e); -} -static void mp_destroy(struct __mempool_t *mp) -{ - int i; - for (i = 0; i != kv_size(mp->pool); ++i) { - free(kv_A(mp->pool, i)->array); - free(kv_A(mp->pool, i)); - } - kv_destroy(mp->pool); - free(mp); -} -/* --- END: memory pool --- */ - -/* --- BEGIN: utilities --- */ -static khash_t(64) *bsw2_connectivity(const bwtl_t *b) -{ - khash_t(64) *h; - uint32_t k, l, cntk[4], cntl[4]; // this is fine - uint64_t x; - khiter_t iter; - int j, ret; - kvec_t(uint64_t) stack; - - kv_init(stack); - h = kh_init(64); - kh_resize(64, h, b->seq_len * 4); - x = b->seq_len; - kv_push(uint64_t, stack, x); - while (kv_size(stack)) { - x = kv_pop(stack); - k = x>>32; l = (uint32_t)x; - bwtl_2occ4(b, k-1, l, cntk, cntl); - for (j = 0; j != 4; ++j) { - k = b->L2[j] + cntk[j] + 1; - l = b->L2[j] + cntl[j]; - if (k > l) continue; - x = (uint64_t)k << 32 | l; - iter = kh_put(64, h, x, &ret); - if (ret) { // if not present - kh_value(h, iter) = 1; - kv_push(uint64_t, stack, x); - } else ++kh_value(h, iter); - } - 
} - kv_destroy(stack); - //fprintf(stderr, "[bsw2_connectivity] %u nodes in the DAG\n", kh_size(h)); - return h; -} -// pick up top T matches at a node -static void cut_tail(bsw2entry_t *u, int T, bsw2entry_t *aux) -{ - int i, *a, n, x; - if (u->n <= T) return; - if (aux->max < u->n) { - aux->max = u->n; - aux->array = (bsw2cell_t*)realloc(aux->array, aux->max * sizeof(bsw2cell_t)); - } - a = (int*)aux->array; - for (i = n = 0; i != u->n; ++i) - if (u->array[i].ql && u->array[i].G > 0) - a[n++] = -u->array[i].G; - if (n <= T) return; - x = -ks_ksmall(int, n, a, T); - n = 0; - for (i = 0; i < u->n; ++i) { - bsw2cell_t *p = u->array + i; - if (p->G == x) ++n; - if (p->G < x || (p->G == x && n >= T)) { - p->qk = p->ql = 0; p->G = 0; - if (p->ppos >= 0) u->array[p->ppos].cpos[p->pj] = -1; - } - } -} -// remove duplicated cells -static inline void remove_duplicate(bsw2entry_t *u, khash_t(qintv) *hash) -{ - int i, ret, j; - khiter_t k; - qintv_t key; - kh_clear(qintv, hash); - for (i = 0; i != u->n; ++i) { - bsw2cell_t *p = u->array + i; - if (p->ql == 0) continue; - key.k = p->qk; key.l = p->ql; - k = kh_put(qintv, hash, key, &ret); - j = -1; - if (ret == 0) { - if ((uint32_t)kh_value(hash, k) >= p->G) j = i; - else { - j = kh_value(hash, k)>>32; - kh_value(hash, k) = (uint64_t)i<<32 | p->G; - } - } else kh_value(hash, k) = (uint64_t)i<<32 | p->G; - if (j >= 0) { - p = u->array + j; - p->qk = p->ql = 0; p->G = 0; - if (p->ppos >= 0) u->array[p->ppos].cpos[p->pj] = -3; - } - } -} -// merge two entries -static void merge_entry(const bsw2opt_t * __restrict opt, bsw2entry_t *u, bsw2entry_t *v, bwtsw2_t *b) -{ - int i; - if (u->n + v->n >= u->max) { - u->max = u->n + v->n; - u->array = (bsw2cell_t*)realloc(u->array, u->max * sizeof(bsw2cell_t)); - } - for (i = 0; i != v->n; ++i) { - bsw2cell_t *p = v->array + i; - if (p->ppos >= 0) p->ppos += u->n; - if (p->cpos[0] >= 0) p->cpos[0] += u->n; - if (p->cpos[1] >= 0) p->cpos[1] += u->n; - if (p->cpos[2] >= 0) p->cpos[2] += u->n; 
- if (p->cpos[3] >= 0) p->cpos[3] += u->n; - } - memcpy(u->array + u->n, v->array, v->n * sizeof(bsw2cell_t)); - u->n += v->n; -} - -static inline bsw2cell_t *push_array_p(bsw2entry_t *e) -{ - if (e->n == e->max) { - e->max = e->max? e->max<<1 : 256; - e->array = (bsw2cell_t*)realloc(e->array, sizeof(bsw2cell_t) * e->max); - } - return e->array + e->n; -} - -static inline double time_elapse(const struct rusage *curr, const struct rusage *last) -{ - long t1 = (curr->ru_utime.tv_sec - last->ru_utime.tv_sec) + (curr->ru_stime.tv_sec - last->ru_stime.tv_sec); - long t2 = (curr->ru_utime.tv_usec - last->ru_utime.tv_usec) + (curr->ru_stime.tv_usec - last->ru_stime.tv_usec); - return (double)t1 + t2 * 1e-6; -} -/* --- END: utilities --- */ - -/* --- BEGIN: processing partial hits --- */ -static void save_hits(const bwtl_t *bwt, int thres, bsw2hit_t *hits, bsw2entry_t *u) -{ - int i; - uint32_t k; // this is fine - for (i = 0; i < u->n; ++i) { - bsw2cell_t *p = u->array + i; - if (p->G < thres) continue; - for (k = u->tk; k <= u->tl; ++k) { - int beg, end; - bsw2hit_t *q = 0; - beg = bwt->sa[k]; end = beg + p->tlen; - if (p->G > hits[beg*2].G) { - hits[beg*2+1] = hits[beg*2]; - q = hits + beg * 2; - } else if (p->G > hits[beg*2+1].G) q = hits + beg * 2 + 1; - if (q) { - q->k = p->qk; q->l = p->ql; q->len = p->qlen; q->G = p->G; - q->beg = beg; q->end = end; q->G2 = q->k == q->l? 0 : q->G; - q->flag = q->n_seeds = 0; - } - } - } -} -/* "narrow hits" are node-to-node hits that have a high score and - * are not so repetitive (|SA interval|<=IS). */ -static void save_narrow_hits(const bwtl_t *bwtl, bsw2entry_t *u, bwtsw2_t *b1, int t, int IS) -{ - int i; - for (i = 0; i < u->n; ++i) { - bsw2hit_t *q; - bsw2cell_t *p = u->array + i; - if (p->G >= t && p->ql - p->qk + 1 <= IS) { // good narrow hit - if (b1->max == b1->n) { - b1->max = b1->max? 
b1->max<<1 : 4; - b1->hits = realloc(b1->hits, b1->max * sizeof(bsw2hit_t)); - } - q = &b1->hits[b1->n++]; - q->k = p->qk; q->l = p->ql; - q->len = p->qlen; - q->G = p->G; q->G2 = 0; - q->beg = bwtl->sa[u->tk]; q->end = q->beg + p->tlen; - q->flag = 0; - // delete p - p->qk = p->ql = 0; p->G = 0; - if (p->ppos >= 0) u->array[p->ppos].cpos[p->pj] = -3; - } - } -} -/* after this, "narrow SA hits" will be expanded and the coordinates - * will be obtained and stored in b->hits[*].k. */ -int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS) -{ - int i, j, n, is_rev; - if (b->n == 0) return 0; - if (bwt && bns) { // convert to chromosomal coordinates if requested - int old_n = b->n; - bsw2hit_t *old_hits = b->hits; - for (i = n = 0; i < b->n; ++i) { // compute the memory to allocated - bsw2hit_t *p = old_hits + i; - if (p->l - p->k + 1 <= IS) n += p->l - p->k + 1; - else if (p->G > 0) ++n; - } - b->n = b->max = n; - b->hits = calloc(b->max, sizeof(bsw2hit_t)); - for (i = j = 0; i < old_n; ++i) { - bsw2hit_t *p = old_hits + i; - if (p->l - p->k + 1 <= IS) { // the hit is no so repetitive - bwtint_t k; - if (p->G == 0 && p->k == 0 && p->l == 0 && p->len == 0) continue; - for (k = p->k; k <= p->l; ++k) { - b->hits[j] = *p; - b->hits[j].k = bns_depos(bns, bwt_sa(bwt, k), &is_rev); - b->hits[j].l = 0; - b->hits[j].is_rev = is_rev; - if (is_rev) b->hits[j].k -= p->len - 1; - ++j; - } - } else if (p->G > 0) { - b->hits[j] = *p; - b->hits[j].k = bns_depos(bns, bwt_sa(bwt, p->k), &is_rev); - b->hits[j].l = 0; - b->hits[j].flag |= 1; - b->hits[j].is_rev = is_rev; - if (is_rev) b->hits[j].k -= p->len - 1; - ++j; - } - } - free(old_hits); - } - for (i = j = 0; i < b->n; ++i) // squeeze out empty elements - if (b->hits[i].G) b->hits[j++] = b->hits[i]; - b->n = j; - ks_introsort(hitG, b->n, b->hits); - for (i = 1; i < b->n; ++i) { - bsw2hit_t *p = b->hits + i; - for (j = 0; j < i; ++j) { - bsw2hit_t *q = b->hits + j; - int compatible = 1; - if (p->is_rev 
!= q->is_rev) continue; // hits from opposite strands are not duplicates - if (p->l == 0 && q->l == 0) { - int qol = (p->end < q->end? p->end : q->end) - (p->beg > q->beg? p->beg : q->beg); // length of query overlap - if (qol < 0) qol = 0; - if ((float)qol / (p->end - p->beg) > MASK_LEVEL || (float)qol / (q->end - q->beg) > MASK_LEVEL) { - int64_t tol = (int64_t)(p->k + p->len < q->k + q->len? p->k + p->len : q->k + q->len) - - (int64_t)(p->k > q->k? p->k : q->k); // length of target overlap - if ((double)tol / p->len > MASK_LEVEL || (double)tol / q->len > MASK_LEVEL) - compatible = 0; - } - } - if (!compatible) { - p->G = 0; - if (q->G2 < p->G2) q->G2 = p->G2; - break; - } - } - } - n = i; - for (i = j = 0; i < n; ++i) { - if (b->hits[i].G == 0) continue; - if (i != j) b->hits[j++] = b->hits[i]; - else ++j; - } - b->n = j; - return b->n; -} - -int bsw2_resolve_query_overlaps(bwtsw2_t *b, float mask_level) -{ - int i, j, n; - if (b->n == 0) return 0; - ks_introsort(hitG, b->n, b->hits); - { // choose a random one - int G0 = b->hits[0].G; - for (i = 1; i < b->n; ++i) - if (b->hits[i].G != G0) break; - j = (int)(i * drand48()); - if (j) { - bsw2hit_t tmp; - tmp = b->hits[0]; b->hits[0] = b->hits[j]; b->hits[j] = tmp; - } - } - for (i = 1; i < b->n; ++i) { - bsw2hit_t *p = b->hits + i; - int all_compatible = 1; - if (p->G == 0) break; - for (j = 0; j < i; ++j) { - bsw2hit_t *q = b->hits + j; - int64_t tol = 0; - int qol, compatible = 0; - float fol; - if (q->G == 0) continue; - qol = (p->end < q->end? p->end : q->end) - (p->beg > q->beg? p->beg : q->beg); - if (qol < 0) qol = 0; - if (p->l == 0 && q->l == 0) { - tol = (int64_t)(p->k + p->len < q->k + q->len? p->k + p->len : q->k + q->len) - - (p->k > q->k? p->k : q->k); - if (tol < 0) tol = 0; - } - fol = (float)qol / (p->end - p->beg < q->end - q->beg? 
p->end - p->beg : q->end - q->beg); - if (fol < mask_level || (tol > 0 && qol < p->end - p->beg && qol < q->end - q->beg)) compatible = 1; - if (!compatible) { - if (q->G2 < p->G) q->G2 = p->G; - all_compatible = 0; - } - } - if (!all_compatible) p->G = 0; - } - n = i; - for (i = j = 0; i < n; ++i) { - if (b->hits[i].G == 0) continue; - if (i != j) b->hits[j++] = b->hits[i]; - else ++j; - } - b->n = j; - return j; -} -/* --- END: processing partial hits --- */ - -/* --- BEGIN: global mem pool --- */ -bsw2global_t *bsw2_global_init() -{ - bsw2global_t *pool; - bsw2stack_t *stack; - pool = calloc(1, sizeof(bsw2global_t)); - stack = calloc(1, sizeof(bsw2stack_t)); - stack->pool = (mempool_t*)calloc(1, sizeof(mempool_t)); - pool->stack = (void*)stack; - return pool; -} - -void bsw2_global_destroy(bsw2global_t *pool) -{ - stack_destroy((bsw2stack_t*)pool->stack); - free(pool->aln_mem); - free(pool); -} -/* --- END: global mem pool --- */ - -static inline int fill_cell(const bsw2opt_t *o, int match_score, bsw2cell_t *c[4]) -{ - int G = c[3]? c[3]->G + match_score : MINUS_INF; - if (c[1]) { - c[0]->I = c[1]->I > c[1]->G - o->q? c[1]->I - o->r : c[1]->G - o->qr; - if (c[0]->I > G) G = c[0]->I; - } else c[0]->I = MINUS_INF; - if (c[2]) { - c[0]->D = c[2]->D > c[2]->G - o->q? c[2]->D - o->r : c[2]->G - o->qr; - if (c[0]->D > G) G = c[0]->D; - } else c[0]->D = MINUS_INF; - return(c[0]->G = G); -} - -static void init_bwtsw2(const bwtl_t *target, const bwt_t *query, bsw2stack_t *s) -{ - bsw2entry_t *u; - bsw2cell_t *x; - - u = mp_alloc(s->pool); - u->tk = 0; u->tl = target->seq_len; - x = push_array_p(u); - *x = g_default_cell; - x->G = 0; x->qk = 0; x->ql = query->seq_len; - u->n++; - stack_push0(s, u); -} -/* On return, ret[1] keeps not-so-repetitive hits (narrow SA hits); ret[0] keeps all hits (right?) 
*/ -bwtsw2_t **bsw2_core(const bntseq_t *bns, const bsw2opt_t *opt, const bwtl_t *target, const bwt_t *query, bsw2global_t *pool) -{ - bsw2stack_t *stack = (bsw2stack_t*)pool->stack; - bwtsw2_t *b, *b1, **b_ret; - int i, j, score_mat[16], *heap, heap_size, n_tot = 0; - struct rusage curr, last; - khash_t(qintv) *rhash; - khash_t(64) *chash; - - // initialize connectivity hash (chash) - chash = bsw2_connectivity(target); - // calculate score matrix - for (i = 0; i != 4; ++i) - for (j = 0; j != 4; ++j) - score_mat[i<<2|j] = (i == j)? opt->a : -opt->b; - // initialize other variables - rhash = kh_init(qintv); - init_bwtsw2(target, query, stack); - heap_size = opt->z; - heap = calloc(heap_size, sizeof(int)); - // initialize the return struct - b = (bwtsw2_t*)calloc(1, sizeof(bwtsw2_t)); - b->n = b->max = target->seq_len * 2; - b->hits = calloc(b->max, sizeof(bsw2hit_t)); - b1 = (bwtsw2_t*)calloc(1, sizeof(bwtsw2_t)); - b_ret = calloc(2, sizeof(void*)); - b_ret[0] = b; b_ret[1] = b1; - // initialize timer - getrusage(0, &last); - // the main loop: traversal of the DAG - while (!stack_isempty(stack)) { - int old_n, tj; - bsw2entry_t *v; - uint32_t tcntk[4], tcntl[4]; - bwtint_t k, l; - - v = stack_pop(stack); old_n = v->n; - n_tot += v->n; - - for (i = 0; i < v->n; ++i) { // test max depth and band width - bsw2cell_t *p = v->array + i; - if (p->ql == 0) continue; - if (p->tlen - (int)p->qlen > opt->bw || (int)p->qlen - p->tlen > opt->bw) { - p->qk = p->ql = 0; - if (p->ppos >= 0) v->array[p->ppos].cpos[p->pj] = -5; - } - } - - // get Occ for the DAG - bwtl_2occ4(target, v->tk - 1, v->tl, tcntk, tcntl); - for (tj = 0; tj != 4; ++tj) { // descend to the children - bwtint_t qcntk[4], qcntl[4]; - int qj, *curr_score_mat = score_mat + tj * 4; - khiter_t iter; - bsw2entry_t *u; - - k = target->L2[tj] + tcntk[tj] + 1; - l = target->L2[tj] + tcntl[tj]; - if (k > l) continue; - // update counter - iter = kh_get(64, chash, (uint64_t)k<<32 | l); - --kh_value(chash, iter); - // 
initialization - u = mp_alloc(stack->pool); - u->tk = k; u->tl = l; - memset(heap, 0, sizeof(int) * opt->z); - // loop through all the nodes in v - for (i = 0; i < v->n; ++i) { - bsw2cell_t *p = v->array + i, *x, *c[4]; // c[0]=>current, c[1]=>I, c[2]=>D, c[3]=>G - int is_added = 0; - if (p->ql == 0) continue; // deleted node - c[0] = x = push_array_p(u); - x->G = MINUS_INF; - p->upos = x->upos = -1; - if (p->ppos >= 0) { // parent has been visited - c[1] = (v->array[p->ppos].upos >= 0)? u->array + v->array[p->ppos].upos : 0; - c[3] = v->array + p->ppos; c[2] = p; - if (fill_cell(opt, curr_score_mat[p->pj], c) > 0) { // then update topology at p and x - x->ppos = v->array[p->ppos].upos; // the parent pos in u - p->upos = u->n++; // the current pos in u - if (x->ppos >= 0) u->array[x->ppos].cpos[p->pj] = p->upos; // the child pos of its parent in u - is_added = 1; - } - } else { - x->D = p->D > p->G - opt->q? p->D - opt->r : p->G - opt->qr; - if (x->D > 0) { - x->G = x->D; - x->I = MINUS_INF; x->ppos = -1; - p->upos = u->n++; - is_added = 1; - } - } - if (is_added) { // x has been added to u->array. 
fill the remaining variables - x->cpos[0] = x->cpos[1] = x->cpos[2] = x->cpos[3] = -1; - x->pj = p->pj; x->qk = p->qk; x->ql = p->ql; x->qlen = p->qlen; x->tlen = p->tlen + 1; - if (x->G > -heap[0]) { - heap[0] = -x->G; - ks_heapadjust(int, 0, heap_size, heap); - } - } - if ((x->G > opt->qr && x->G >= -heap[0]) || i < old_n) { // good node in u, or in v - if (p->cpos[0] == -1 || p->cpos[1] == -1 || p->cpos[2] == -1 || p->cpos[3] == -1) { - bwt_2occ4(query, p->qk - 1, p->ql, qcntk, qcntl); - for (qj = 0; qj != 4; ++qj) { // descend to the prefix trie - if (p->cpos[qj] != -1) continue; // this node will be visited later - k = query->L2[qj] + qcntk[qj] + 1; - l = query->L2[qj] + qcntl[qj]; - if (k > l) { p->cpos[qj] = -2; continue; } - x = push_array_p(v); - p = v->array + i; // p may not point to the correct position after realloc - x->G = x->I = x->D = MINUS_INF; - x->qk = k; x->ql = l; x->pj = qj; x->qlen = p->qlen + 1; x->ppos = i; x->tlen = p->tlen; - x->cpos[0] = x->cpos[1] = x->cpos[2] = x->cpos[3] = -1; - p->cpos[qj] = v->n++; - } // ~for(qj) - } // ~if(p->cpos[]) - } // ~if - } // ~for(i) - if (u->n) save_hits(target, opt->t, b->hits, u); - { // push u to the stack (or to the pending array) - uint32_t cnt, pos; - cnt = (uint32_t)kh_value(chash, iter); - pos = kh_value(chash, iter)>>32; - if (pos) { // something in the pending array, then merge - bsw2entry_t *w = kv_A(stack->pending, pos-1); - if (u->n) { - if (w->n < u->n) { // swap - w = u; u = kv_A(stack->pending, pos-1); kv_A(stack->pending, pos-1) = w; - } - merge_entry(opt, w, u, b); - } - if (cnt == 0) { // move from pending to stack0 - remove_duplicate(w, rhash); - save_narrow_hits(target, w, b1, opt->t, opt->is); - cut_tail(w, opt->z, u); - stack_push0(stack, w); - kv_A(stack->pending, pos-1) = 0; - --stack->n_pending; - } - mp_free(stack->pool, u); - } else if (cnt) { // the first time - if (u->n) { // push to the pending queue - ++stack->n_pending; - kv_push(bsw2entry_p, stack->pending, u); - 
kh_value(chash, iter) = (uint64_t)kv_size(stack->pending)<<32 | cnt; - } else mp_free(stack->pool, u); - } else { // cnt == 0, then push to the stack - bsw2entry_t *w = mp_alloc(stack->pool); - save_narrow_hits(target, u, b1, opt->t, opt->is); - cut_tail(u, opt->z, w); - mp_free(stack->pool, w); - stack_push0(stack, u); - } - } - } // ~for(tj) - mp_free(stack->pool, v); - } // while(top) - getrusage(0, &curr); - for (i = 0; i < 2; ++i) - for (j = 0; j < b_ret[i]->n; ++j) - b_ret[i]->hits[j].n_seeds = 0; - bsw2_resolve_duphits(bns, query, b, opt->is); - bsw2_resolve_duphits(bns, query, b1, opt->is); - //fprintf(stderr, "stats: %.3lf sec; %d elems\n", time_elapse(&curr, &last), n_tot); - // free - free(heap); - kh_destroy(qintv, rhash); - kh_destroy(64, chash); - stack->pending.n = stack->stack0.n = 0; - return b_ret; -} diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/bwtsw2_main.c --- a/bwa-0.7.9a/bwtsw2_main.c Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,89 +0,0 @@ -#include -#include -#include -#include -#include -#include "bwt.h" -#include "bwtsw2.h" -#include "utils.h" -#include "bwa.h" - -int bwa_bwtsw2(int argc, char *argv[]) -{ - bsw2opt_t *opt; - bwaidx_t *idx; - int c; - - opt = bsw2_init_opt(); - srand48(11); - while ((c = getopt(argc, argv, "q:r:a:b:t:T:w:d:z:m:s:c:N:Hf:MI:SG:C")) >= 0) { - switch (c) { - case 'q': opt->q = atoi(optarg); break; - case 'r': opt->r = atoi(optarg); break; - case 'a': opt->a = atoi(optarg); break; - case 'b': opt->b = atoi(optarg); break; - case 'w': opt->bw = atoi(optarg); break; - case 'T': opt->t = atoi(optarg); break; - case 't': opt->n_threads = atoi(optarg); break; - case 'z': opt->z = atoi(optarg); break; - case 's': opt->is = atoi(optarg); break; - case 'm': opt->mask_level = atof(optarg); break; - case 'c': opt->coef = atof(optarg); break; - case 'N': opt->t_seeds = atoi(optarg); break; - case 'M': opt->multi_2nd = 1; break; - case 'H': opt->hard_clip = 1; break; - case 'f': 
xreopen(optarg, "w", stdout); break; - case 'I': opt->max_ins = atoi(optarg); break; - case 'S': opt->skip_sw = 1; break; - case 'C': opt->cpy_cmt = 1; break; - case 'G': opt->max_chain_gap = atoi(optarg); break; - default: return 1; - } - } - opt->qr = opt->q + opt->r; - - if (optind + 2 > argc) { - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: bwa bwasw [options] [query2.fa]\n\n"); - fprintf(stderr, "Options: -a INT score for a match [%d]\n", opt->a); - fprintf(stderr, " -b INT mismatch penalty [%d]\n", opt->b); - fprintf(stderr, " -q INT gap open penalty [%d]\n", opt->q); - fprintf(stderr, " -r INT gap extension penalty [%d]\n", opt->r); - fprintf(stderr, " -w INT band width [%d]\n", opt->bw); - fprintf(stderr, " -m FLOAT mask level [%.2f]\n", opt->mask_level); - fprintf(stderr, "\n"); - fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); - fprintf(stderr, " -f FILE file to output results to instead of stdout\n"); - fprintf(stderr, " -H in SAM output, use hard clipping instead of soft clipping\n"); - fprintf(stderr, " -C copy FASTA/Q comment to SAM output\n"); - fprintf(stderr, " -M mark multi-part alignments as secondary\n"); - fprintf(stderr, " -S skip Smith-Waterman read pairing\n"); - fprintf(stderr, " -I INT ignore pairs with insert >=INT for inferring the size distr [%d]\n", opt->max_ins); - fprintf(stderr, "\n"); - fprintf(stderr, " -T INT score threshold divided by a [%d]\n", opt->t); - fprintf(stderr, " -c FLOAT coefficient of length-threshold adjustment [%.1f]\n", opt->coef); - fprintf(stderr, " -z INT Z-best [%d]\n", opt->z); - fprintf(stderr, " -s INT maximum seeding interval size [%d]\n", opt->is); - fprintf(stderr, " -N INT # seeds to trigger rev aln; 2*INT is also the chaining threshold [%d]\n", opt->t_seeds); - fprintf(stderr, " -G INT maximum gap size during chaining [%d]\n", opt->max_chain_gap); - fprintf(stderr, "\n"); - fprintf(stderr, "Note: For long Illumina, 454 and Sanger reads, assembly contigs, fosmids and\n"); - 
fprintf(stderr, " BACs, the default setting usually works well. For the current PacBio\n"); - fprintf(stderr, " reads (end of 2010), '-b5 -q2 -r1 -z10' is recommended. One may also\n"); - fprintf(stderr, " increase '-z' for better sensitivity.\n"); - fprintf(stderr, "\n"); - - return 1; - } - - // adjust opt for opt->a - opt->t *= opt->a; - opt->coef *= opt->a; - - if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS)) == 0) return 1; - bsw2_aln(opt, idx->bns, idx->bwt, argv[optind+1], optind+2 < argc? argv[optind+2] : 0); - bwa_idx_destroy(idx); - free(opt); - - return 0; -} diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/bwtsw2_pair.c --- a/bwa-0.7.9a/bwtsw2_pair.c Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,268 +0,0 @@ -#include -#include -#include -#include -#include "utils.h" -#include "bwt.h" -#include "bntseq.h" -#include "bwtsw2.h" -#include "kstring.h" -#include "ksw.h" - -#ifdef USE_MALLOC_WRAPPERS -# include "malloc_wrap.h" -#endif - -#define MIN_RATIO 0.8 -#define OUTLIER_BOUND 2.0 -#define MAX_STDDEV 4.0 -#define EXT_STDDEV 4.0 - -typedef struct { - int low, high, failed; - double avg, std; -} bsw2pestat_t; - -bsw2pestat_t bsw2_stat(int n, bwtsw2_t **buf, kstring_t *msg, int max_ins) -{ - int i, k, x, p25, p50, p75, tmp, max_len = 0; - uint64_t *isize; - bsw2pestat_t r; - - memset(&r, 0, sizeof(bsw2pestat_t)); - isize = calloc(n, 8); - for (i = k = 0; i < n; i += 2) { - bsw2hit_t *t[2]; - int l; - if (buf[i] == 0 || buf[i]->n != 1 || buf[i+1]->n != 1) continue; // more than 1 hits - t[0] = &buf[i]->hits[0]; t[1] = &buf[i+1]->hits[0]; - if (t[0]->G2 > 0.8 * t[0]->G) continue; // the best hit is not good enough - if (t[1]->G2 > 0.8 * t[1]->G) continue; // the best hit is not good enough - l = t[0]->k > t[1]->k? t[0]->k - t[1]->k + t[1]->len : t[1]->k - t[0]->k + t[0]->len; - if (l >= max_ins) continue; // skip pairs with excessively large insert - max_len = max_len > t[0]->end - t[0]->beg? 
max_len : t[0]->end - t[0]->beg; - max_len = max_len > t[1]->end - t[1]->beg? max_len : t[1]->end - t[1]->beg; - isize[k++] = l; - } - ks_introsort_64(k, isize); - p25 = isize[(int)(.25 * k + .499)]; - p50 = isize[(int)(.50 * k + .499)]; - p75 = isize[(int)(.75 * k + .499)]; - ksprintf(msg, "[%s] infer the insert size distribution from %d high-quality pairs.\n", __func__, k); - if (k < 8) { - ksprintf(msg, "[%s] fail to infer the insert size distribution: too few good pairs.\n", __func__); - free(isize); - r.failed = 1; - return r; - } - tmp = (int)(p25 - OUTLIER_BOUND * (p75 - p25) + .499); - r.low = tmp > max_len? tmp : max_len; - if (r.low < 1) r.low = 1; - r.high = (int)(p75 + OUTLIER_BOUND * (p75 - p25) + .499); - if (r.low > r.high) { - ksprintf(msg, "[%s] fail to infer the insert size distribution: upper bound is smaller than max read length.\n", __func__); - free(isize); - r.failed = 1; - return r; - } - ksprintf(msg, "[%s] (25, 50, 75) percentile: (%d, %d, %d)\n", __func__, p25, p50, p75); - ksprintf(msg, "[%s] low and high boundaries for computing mean and std.dev: (%d, %d)\n", __func__, r.low, r.high); - for (i = x = 0, r.avg = 0; i < k; ++i) - if (isize[i] >= r.low && isize[i] <= r.high) - r.avg += isize[i], ++x; - r.avg /= x; - for (i = 0, r.std = 0; i < k; ++i) - if (isize[i] >= r.low && isize[i] <= r.high) - r.std += (isize[i] - r.avg) * (isize[i] - r.avg); - r.std = sqrt(r.std / x); - ksprintf(msg, "[%s] mean and std.dev: (%.2f, %.2f)\n", __func__, r.avg, r.std); - tmp = (int)(p25 - 3. * (p75 - p25) + .499); - r.low = tmp > max_len? tmp : max_len; - if (r.low < 1) r.low = 1; - r.high = (int)(p75 + 3. * (p75 - p25) + .499); - if (r.low > r.avg - MAX_STDDEV * r.std) r.low = (int)(r.avg - MAX_STDDEV * r.std + .499); - r.low = tmp > max_len? 
tmp : max_len; - if (r.high < r.avg - MAX_STDDEV * r.std) r.high = (int)(r.avg + MAX_STDDEV * r.std + .499); - ksprintf(msg, "[%s] low and high boundaries for proper pairs: (%d, %d)\n", __func__, r.low, r.high); - free(isize); - return r; -} - -typedef struct { - int n_cigar, beg, end, len; - int64_t pos; - uint32_t *cigar; -} pairaux_t; - -extern unsigned char nst_nt4_table[256]; - -void bsw2_pair1(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, const bsw2pestat_t *st, const bsw2hit_t *h, int l_mseq, const char *mseq, bsw2hit_t *a, int8_t g_mat[25]) -{ - extern void seq_reverse(int len, ubyte_t *seq, int is_comp); - int64_t k, beg, end; - uint8_t *seq, *ref; - int i; - // compute the region start and end - a->n_seeds = 1; a->flag |= BSW2_FLAG_MATESW; // before calling this routine, *a has been cleared with memset(0); the flag is set with 1<<6/7 - if (h->is_rev == 0) { - beg = (int64_t)(h->k + st->avg - EXT_STDDEV * st->std - l_mseq + .499); - if (beg < h->k) beg = h->k; - end = (int64_t)(h->k + st->avg + EXT_STDDEV * st->std + .499); - a->is_rev = 1; a->flag |= 16; - } else { - beg = (int64_t)(h->k + h->end - h->beg - st->avg - EXT_STDDEV * st->std + .499); - end = (int64_t)(h->k + h->end - h->beg - st->avg + EXT_STDDEV * st->std + l_mseq + .499); - if (end > h->k + (h->end - h->beg)) end = h->k + (h->end - h->beg); - a->is_rev = 0; - } - if (beg < 1) beg = 1; - if (end > l_pac) end = l_pac; - if (end - beg < l_mseq) return; - // generate the sequence - seq = malloc(l_mseq + (end - beg)); - ref = seq + l_mseq; - for (k = beg; k < end; ++k) - ref[k - beg] = pac[k>>2] >> ((~k&3)<<1) & 0x3; - if (h->is_rev == 0) { - for (i = 0; i < l_mseq; ++i) { // on the reverse strand - int c = nst_nt4_table[(int)mseq[i]]; - seq[l_mseq - 1 - i] = c > 3? 4 : 3 - c; - } - } else { - for (i = 0; i < l_mseq; ++i) // on the forward strand - seq[i] = nst_nt4_table[(int)mseq[i]]; - } - { - int flag = KSW_XSUBO | KSW_XSTART | (l_mseq * g_mat[0] < 250? 
KSW_XBYTE : 0) | opt->t; - kswr_t aln; - aln = ksw_align(l_mseq, seq, end - beg, ref, 5, g_mat, opt->q, opt->r, flag, 0); - a->G = aln.score; - a->G2 = aln.score2; - if (a->G < opt->t) a->G = 0; - if (a->G2 < opt->t) a->G2 = 0; - if (a->G2) a->flag |= BSW2_FLAG_TANDEM; - a->k = beg + aln.tb; - a->len = aln.te - aln.tb + 1; - a->beg = aln.qb; - a->end = aln.qe + 1; - /* - printf("[Q] "); for (i = 0; i < l_mseq; ++i) putchar("ACGTN"[(int)seq[i]]); putchar('\n'); - printf("[R] "); for (i = 0; i < end - beg; ++i) putchar("ACGTN"[(int)ref[i]]); putchar('\n'); - printf("G=%d,G2=%d,beg=%d,end=%d,k=%lld,len=%d\n", a->G, a->G2, a->beg, a->end, a->k, a->len); - */ - } - if (a->is_rev) i = a->beg, a->beg = l_mseq - a->end, a->end = l_mseq - i; - free(seq); -} - -void bsw2_pair(const bsw2opt_t *opt, int64_t l_pac, const uint8_t *pac, int n, bsw2seq1_t *seq, bwtsw2_t **hits) -{ - extern int bsw2_resolve_duphits(const bntseq_t *bns, const bwt_t *bwt, bwtsw2_t *b, int IS); - bsw2pestat_t pes; - int i, j, k, n_rescued = 0, n_moved = 0, n_fixed = 0; - int8_t g_mat[25]; - kstring_t msg; - memset(&msg, 0, sizeof(kstring_t)); - pes = bsw2_stat(n, hits, &msg, opt->max_ins); - for (i = k = 0; i < 5; ++i) { - for (j = 0; j < 4; ++j) - g_mat[k++] = i == j? 
opt->a : -opt->b; - g_mat[k++] = 0; - } - for (i = 0; i < n; i += 2) { - bsw2hit_t a[2]; - memset(&a, 0, sizeof(bsw2hit_t) * 2); - a[0].flag = 1<<6; a[1].flag = 1<<7; - for (j = 0; j < 2; ++j) { // set the read1/2 flag - if (hits[i+j] == 0) continue; - for (k = 0; k < hits[i+j]->n; ++k) { - bsw2hit_t *p = &hits[i+j]->hits[k]; - p->flag |= 1<<(6+j); - } - } - if (pes.failed) continue; - if (hits[i] == 0 || hits[i+1] == 0) continue; // one end has excessive N - if (hits[i]->n != 1 && hits[i+1]->n != 1) continue; // no end has exactly one hit - if (hits[i]->n > 1 || hits[i+1]->n > 1) continue; // one read has more than one hit - if (!opt->skip_sw) { - if (hits[i+0]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+0]->hits[0], seq[i+1].l, seq[i+1].seq, &a[1], g_mat); - if (hits[i+1]->n == 1) bsw2_pair1(opt, l_pac, pac, &pes, &hits[i+1]->hits[0], seq[i+0].l, seq[i+0].seq, &a[0], g_mat); - } // else a[0].G == a[1].G == a[0].G2 == a[1].G2 == 0 - // the following enumerate all possibilities. It is tedious but necessary... - if (hits[i]->n + hits[i+1]->n == 1) { // one end mapped; the other not; - bwtsw2_t *p[2]; - int which; - if (hits[i]->n == 1) p[0] = hits[i], p[1] = hits[i+1], which = 1; - else p[0] = hits[i+1], p[1] = hits[i], which = 0; - if (a[which].G == 0) continue; - a[which].flag |= BSW2_FLAG_RESCUED; - if (p[1]->max == 0) { - p[1]->max = 1; - p[1]->hits = malloc(sizeof(bsw2hit_t)); - } - p[1]->hits[0] = a[which]; - p[1]->n = 1; - p[0]->hits[0].flag |= 2; - p[1]->hits[0].flag |= 2; - ++n_rescued; - } else { // then both ends mapped - int is_fixed = 0; - //fprintf(stderr, "%d; %lld,%lld; %d,%d\n", a[0].is_rev, hits[i]->hits[0].k, a[0].k, hits[i]->hits[0].end, a[0].end); - for (j = 0; j < 2; ++j) { // fix wrong mappings and wrong suboptimal alignment score - bsw2hit_t *p = &hits[i+j]->hits[0]; - if (p->G < a[j].G) { // the orginal mapping is suboptimal - a[j].G2 = a[j].G2 > p->G? a[j].G2 : p->G; // FIXME: reset BSW2_FLAG_TANDEM? 
- *p = a[j]; - ++n_fixed; - is_fixed = 1; - } else if (p->k != a[j].k && p->G2 < a[j].G) { - p->G2 = a[j].G; - } else if (p->k == a[j].k && p->G2 < a[j].G2) { - p->G2 = a[j].G2; - } - } - if (hits[i]->hits[0].k == a[0].k && hits[i+1]->hits[0].k == a[1].k) { // properly paired and no ends need to be moved - for (j = 0; j < 2; ++j) - hits[i+j]->hits[0].flag |= 2 | (a[j].flag & BSW2_FLAG_TANDEM); - } else if (hits[i]->hits[0].k == a[0].k || hits[i+1]->hits[0].k == a[1].k) { // a tandem match - for (j = 0; j < 2; ++j) { - hits[i+j]->hits[0].flag |= 2; - if (hits[i+j]->hits[0].k != a[j].k) - hits[i+j]->hits[0].flag |= BSW2_FLAG_TANDEM; - } - } else if (!is_fixed && (a[0].G || a[1].G)) { // it is possible to move one end - if (a[0].G && a[1].G) { // now we have two "proper pairs" - int G[2]; - double diff; - G[0] = hits[i]->hits[0].G + a[1].G; - G[1] = hits[i+1]->hits[0].G + a[0].G; - diff = fabs(G[0] - G[1]) / (opt->a + opt->b) / ((hits[i]->hits[0].len + a[1].len + hits[i+1]->hits[0].len + a[0].len) / 2.); - if (diff > 0.05) a[G[0] > G[1]? 0 : 1].G = 0; - } - if (a[0].G == 0 || a[1].G == 0) { // one proper pair only - bsw2hit_t *p[2]; // p[0] points the unchanged hit; p[1] to the hit to be moved - int which, isize; - double dev, diff; - if (a[0].G) p[0] = &hits[i+1]->hits[0], p[1] = &hits[i]->hits[0], which = 0; - else p[0] = &hits[i]->hits[0], p[1] = &hits[i+1]->hits[0], which = 1; - isize = p[0]->is_rev? p[0]->k + p[0]->len - a[which].k : a[which].k + a[which].len - p[0]->k; - dev = fabs(isize - pes.avg) / pes.std; - diff = (double)(p[1]->G - a[which].G) / (opt->a + opt->b) / (p[1]->end - p[1]->beg) * 100.0; - if (diff < dev * 2.) 
{ // then move (heuristic) - a[which].G2 = a[which].G; - p[1][0] = a[which]; - p[1]->flag |= BSW2_FLAG_MOVED | 2; - p[0]->flag |= 2; - ++n_moved; - } - } - } else if (is_fixed) { - hits[i+0]->hits[0].flag |= 2; - hits[i+1]->hits[0].flag |= 2; - } - } - } - ksprintf(&msg, "[%s] #fixed=%d, #rescued=%d, #moved=%d\n", __func__, n_fixed, n_rescued, n_moved); - fputs(msg.s, stderr); - free(msg.s); -} diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/example.c --- a/bwa-0.7.9a/example.c Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,64 +0,0 @@ -#include -#include -#include -#include -#include -#include "bwamem.h" -#include "kseq.h" // for the FASTA/Q parser -KSEQ_DECLARE(gzFile) - -#ifdef USE_MALLOC_WRAPPERS -# include "malloc_wrap.h" -#endif - -int main(int argc, char *argv[]) -{ - bwaidx_t *idx; - gzFile fp; - kseq_t *ks; - mem_opt_t *opt; - - if (argc < 3) { - fprintf(stderr, "Usage: bwamem-lite \n"); - return 1; - } - - idx = bwa_idx_load(argv[1], BWA_IDX_ALL); // load the BWA index - if (NULL == idx) { - fprintf(stderr, "Index load failed.\n"); - exit(EXIT_FAILURE); - } - fp = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r"); - if (NULL == fp) { - fprintf(stderr, "Couldn't open %s : %s\n", - strcmp(argv[2], "-") ? argv[2] : "stdin", - errno ? 
strerror(errno) : "Out of memory"); - exit(EXIT_FAILURE); - } - ks = kseq_init(fp); // initialize the FASTA/Q parser - opt = mem_opt_init(); // initialize the BWA-MEM parameters to the default values - - while (kseq_read(ks) >= 0) { // read one sequence - mem_alnreg_v ar; - int i, k; - ar = mem_align1(opt, idx->bwt, idx->bns, idx->pac, ks->seq.l, ks->seq.s); // get all the hits - for (i = 0; i < ar.n; ++i) { // traverse each hit - mem_aln_t a; - if (ar.a[i].secondary >= 0) continue; // skip secondary alignments - a = mem_reg2aln(opt, idx->bns, idx->pac, ks->seq.l, ks->seq.s, &ar.a[i]); // get forward-strand position and CIGAR - // print alignment - err_printf("%s\t%c\t%s\t%ld\t%d\t", ks->name.s, "+-"[a.is_rev], idx->bns->anns[a.rid].name, (long)a.pos, a.mapq); - for (k = 0; k < a.n_cigar; ++k) // print CIGAR - err_printf("%d%c", a.cigar[k]>>4, "MIDSH"[a.cigar[k]&0xf]); - err_printf("\t%d\n", a.NM); // print edit distance - free(a.cigar); // don't forget to deallocate CIGAR - } - free(ar.a); // and deallocate the hit list - } - - free(opt); - kseq_destroy(ks); - err_gzclose(fp); - bwa_idx_destroy(idx); - return 0; -} diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/fastmap.c --- a/bwa-0.7.9a/fastmap.c Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,328 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include "bwa.h" -#include "bwamem.h" -#include "kvec.h" -#include "utils.h" -#include "kseq.h" -#include "utils.h" -KSEQ_DECLARE(gzFile) - -extern unsigned char nst_nt4_table[256]; - -void *kopen(const char *fn, int *_fd); -int kclose(void *a); - -static void update_a(mem_opt_t *opt, const mem_opt_t *opt0) -{ - if (opt0->a) { // matching score is changed - if (!opt0->b) opt->b *= opt->a; - if (!opt0->T) opt->T *= opt->a; - if (!opt0->o_del) opt->o_del *= opt->a; - if (!opt0->e_del) opt->e_del *= opt->a; - if (!opt0->o_ins) opt->o_ins *= opt->a; - if (!opt0->e_ins) opt->e_ins *= opt->a; - if (!opt0->zdrop) 
opt->zdrop *= opt->a; - if (!opt0->pen_clip5) opt->pen_clip5 *= opt->a; - if (!opt0->pen_clip3) opt->pen_clip3 *= opt->a; - if (!opt0->pen_unpaired) opt->pen_unpaired *= opt->a; - } -} - -int main_mem(int argc, char *argv[]) -{ - mem_opt_t *opt, opt0; - int fd, fd2, i, c, n, copy_comment = 0; - gzFile fp, fp2 = 0; - kseq_t *ks, *ks2 = 0; - bseq1_t *seqs; - bwaidx_t *idx; - char *p, *rg_line = 0; - const char *mode = 0; - void *ko = 0, *ko2 = 0; - int64_t n_processed = 0; - mem_pestat_t pes[4], *pes0 = 0; - - memset(pes, 0, 4 * sizeof(mem_pestat_t)); - for (i = 0; i < 4; ++i) pes[i].failed = 1; - - opt = mem_opt_init(); - memset(&opt0, 0, sizeof(mem_opt_t)); - while ((c = getopt(argc, argv, "epaFMCSPHYk:c:v:s:r:t:R:A:B:O:E:U:w:L:d:T:Q:D:m:I:N:W:x:G:h:")) >= 0) { - if (c == 'k') opt->min_seed_len = atoi(optarg), opt0.min_seed_len = 1; - else if (c == 'x') mode = optarg; - else if (c == 'w') opt->w = atoi(optarg), opt0.w = 1; - else if (c == 'A') opt->a = atoi(optarg), opt0.a = 1; - else if (c == 'B') opt->b = atoi(optarg), opt0.b = 1; - else if (c == 'T') opt->T = atoi(optarg), opt0.T = 1; - else if (c == 'U') opt->pen_unpaired = atoi(optarg), opt0.pen_unpaired = 1; - else if (c == 't') opt->n_threads = atoi(optarg), opt->n_threads = opt->n_threads > 1? 
opt->n_threads : 1; - else if (c == 'P') opt->flag |= MEM_F_NOPAIRING; - else if (c == 'a') opt->flag |= MEM_F_ALL; - else if (c == 'p') opt->flag |= MEM_F_PE; - else if (c == 'M') opt->flag |= MEM_F_NO_MULTI; - else if (c == 'S') opt->flag |= MEM_F_NO_RESCUE; - else if (c == 'e') opt->flag |= MEM_F_SELF_OVLP; - else if (c == 'F') opt->flag |= MEM_F_ALN_REG; - else if (c == 'Y') opt->flag |= MEM_F_SOFTCLIP; - else if (c == 'c') opt->max_occ = atoi(optarg), opt0.max_occ = 1; - else if (c == 'd') opt->zdrop = atoi(optarg), opt0.zdrop = 1; - else if (c == 'v') bwa_verbose = atoi(optarg); - else if (c == 'r') opt->split_factor = atof(optarg), opt0.split_factor = 1.; - else if (c == 'D') opt->drop_ratio = atof(optarg), opt0.drop_ratio = 1.; - else if (c == 'm') opt->max_matesw = atoi(optarg), opt0.max_matesw = 1; - else if (c == 'h') opt->max_hits = atoi(optarg), opt0.max_hits = 1; - else if (c == 's') opt->split_width = atoi(optarg), opt0.split_width = 1; - else if (c == 'G') opt->max_chain_gap = atoi(optarg), opt0.max_chain_gap = 1; - else if (c == 'N') opt->max_chain_extend = atoi(optarg), opt0.max_chain_extend = 1; - else if (c == 'W') opt->min_chain_weight = atoi(optarg), opt0.min_chain_weight = 1; - else if (c == 'C') copy_comment = 1; - else if (c == 'Q') { - opt0.mapQ_coef_len = 1; - opt->mapQ_coef_len = atoi(optarg); - opt->mapQ_coef_fac = opt->mapQ_coef_len > 0? 
log(opt->mapQ_coef_len) : 0; - } else if (c == 'O') { - opt0.o_del = opt0.o_ins = 1; - opt->o_del = opt->o_ins = strtol(optarg, &p, 10); - if (*p != 0 && ispunct(*p) && isdigit(p[1])) - opt->o_ins = strtol(p+1, &p, 10); - } else if (c == 'E') { - opt0.e_del = opt0.e_ins = 1; - opt->e_del = opt->e_ins = strtol(optarg, &p, 10); - if (*p != 0 && ispunct(*p) && isdigit(p[1])) - opt->e_ins = strtol(p+1, &p, 10); - } else if (c == 'L') { - opt0.pen_clip5 = opt0.pen_clip3 = 1; - opt->pen_clip5 = opt->pen_clip3 = strtol(optarg, &p, 10); - if (*p != 0 && ispunct(*p) && isdigit(p[1])) - opt->pen_clip3 = strtol(p+1, &p, 10); - } else if (c == 'R') { - if ((rg_line = bwa_set_rg(optarg)) == 0) return 1; // FIXME: memory leak - } else if (c == 'I') { // specify the insert size distribution - pes0 = pes; - pes[1].failed = 0; - pes[1].avg = strtod(optarg, &p); - pes[1].std = pes[1].avg * .1; - if (*p != 0 && ispunct(*p) && isdigit(p[1])) - pes[1].std = strtod(p+1, &p); - pes[1].high = (int)(pes[1].avg + 4. * pes[1].std + .499); - pes[1].low = (int)(pes[1].avg - 4. 
* pes[1].std + .499); - if (pes[1].low < 1) pes[1].low = 1; - if (*p != 0 && ispunct(*p) && isdigit(p[1])) - pes[1].high = (int)(strtod(p+1, &p) + .499); - if (*p != 0 && ispunct(*p) && isdigit(p[1])) - pes[1].low = (int)(strtod(p+1, &p) + .499); - if (bwa_verbose >= 3) - fprintf(stderr, "[M::%s] mean insert size: %.3f, stddev: %.3f, max: %d, min: %d\n", - __func__, pes[1].avg, pes[1].std, pes[1].high, pes[1].low); - } - else return 1; - } - if (opt->n_threads < 1) opt->n_threads = 1; - if (optind + 1 >= argc || optind + 3 < argc) { - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: bwa mem [options] [in2.fq]\n\n"); - fprintf(stderr, "Algorithm options:\n\n"); - fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); - fprintf(stderr, " -k INT minimum seed length [%d]\n", opt->min_seed_len); - fprintf(stderr, " -w INT band width for banded alignment [%d]\n", opt->w); - fprintf(stderr, " -d INT off-diagonal X-dropoff [%d]\n", opt->zdrop); - fprintf(stderr, " -r FLOAT look for internal seeds inside a seed longer than {-k} * FLOAT [%g]\n", opt->split_factor); -// fprintf(stderr, " -s INT look for internal seeds inside a seed with less than INT occ [%d]\n", opt->split_width); - fprintf(stderr, " -c INT skip seeds with more than INT occurrences [%d]\n", opt->max_occ); - fprintf(stderr, " -D FLOAT drop chains shorter than FLOAT fraction of the longest overlapping chain [%.2f]\n", opt->drop_ratio); - fprintf(stderr, " -W INT discard a chain if seeded bases shorter than INT [0]\n"); - fprintf(stderr, " -m INT perform at most INT rounds of mate rescues for each read [%d]\n", opt->max_matesw); - fprintf(stderr, " -S skip mate rescue\n"); - fprintf(stderr, " -P skip pairing; mate rescue performed unless -S also in use\n"); - fprintf(stderr, " -e discard full-length exact matches\n"); - fprintf(stderr, " -A INT score for a sequence match, which scales options -TdBOELU unless overridden [%d]\n", opt->a); - fprintf(stderr, " -B INT penalty for a mismatch [%d]\n", 
opt->b); - fprintf(stderr, " -O INT[,INT] gap open penalties for deletions and insertions [%d,%d]\n", opt->o_del, opt->o_ins); - fprintf(stderr, " -E INT[,INT] gap extension penalty; a gap of size k cost '{-O} + {-E}*k' [%d,%d]\n", opt->e_del, opt->e_ins); - fprintf(stderr, " -L INT[,INT] penalty for 5'- and 3'-end clipping [%d,%d]\n", opt->pen_clip5, opt->pen_clip3); - fprintf(stderr, " -U INT penalty for an unpaired read pair [%d]\n", opt->pen_unpaired); - fprintf(stderr, " -x STR read type. Setting -x changes multiple parameters unless overriden [null]\n"); - fprintf(stderr, " pacbio: -k17 -W40 -r10 -A2 -B5 -O2 -E1 -L0\n"); - fprintf(stderr, " pbread: -k13 -W40 -c1000 -r10 -A2 -B5 -O2 -E1 -N25 -FeaD.001\n"); - fprintf(stderr, "\nInput/output options:\n\n"); - fprintf(stderr, " -p first query file consists of interleaved paired-end sequences\n"); - fprintf(stderr, " -R STR read group header line such as '@RG\\tID:foo\\tSM:bar' [null]\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " -v INT verbose level: 1=error, 2=warning, 3=message, 4+=debugging [%d]\n", bwa_verbose); - fprintf(stderr, " -T INT minimum score to output [%d]\n", opt->T); - fprintf(stderr, " -h INT if there are 80%% of the max score, output all in XA [%d]\n", opt->max_hits); - fprintf(stderr, " -a output all alignments for SE or unpaired PE\n"); - fprintf(stderr, " -C append FASTA/FASTQ comment to SAM output\n"); - fprintf(stderr, " -Y use soft clipping for supplementary alignments\n"); - fprintf(stderr, " -M mark shorter split hits as secondary\n\n"); - fprintf(stderr, " -I FLOAT[,FLOAT[,INT[,INT]]]\n"); - fprintf(stderr, " specify the mean, standard deviation (10%% of the mean if absent), max\n"); - fprintf(stderr, " (4 sigma from the mean if absent) and min of the insert size distribution.\n"); - fprintf(stderr, " FR orientation only. 
[inferred]\n"); - fprintf(stderr, "\n"); - fprintf(stderr, "Note: Please read the man page for detailed description of the command line and options.\n"); - fprintf(stderr, "\n"); - free(opt); - return 1; - } - - if (mode) { - if (strcmp(mode, "pacbio") == 0 || strcmp(mode, "pbref") == 0 || strcmp(mode, "pbread1") == 0 || strcmp(mode, "pbread") == 0) { - if (!opt0.a) opt->a = 2, opt0.a = 1; - update_a(opt, &opt0); - if (!opt0.o_del) opt->o_del = 2; - if (!opt0.e_del) opt->e_del = 1; - if (!opt0.o_ins) opt->o_ins = 2; - if (!opt0.e_ins) opt->e_ins = 1; - if (!opt0.b) opt->b = 5; - if (opt0.split_factor == 0.) opt->split_factor = 10.; - if (!opt0.min_chain_weight) opt->min_chain_weight = 40; - if (strcmp(mode, "pbread1") == 0 || strcmp(mode, "pbread") == 0) { - opt->flag |= MEM_F_ALL | MEM_F_SELF_OVLP | MEM_F_ALN_REG; - if (!opt0.max_occ) opt->max_occ = 1000; - if (!opt0.min_seed_len) opt->min_seed_len = 13; - if (!opt0.max_chain_extend) opt->max_chain_extend = 25; - if (opt0.drop_ratio == 0.) 
opt->drop_ratio = .001; - } else { - if (!opt0.min_seed_len) opt->min_seed_len = 17; - if (!opt0.pen_clip5) opt->pen_clip5 = 0; - if (!opt0.pen_clip3) opt->pen_clip3 = 0; - } - } else { - fprintf(stderr, "[E::%s] unknown read type '%s'\n", __func__, mode); - return 1; // FIXME memory leak - } - } else update_a(opt, &opt0); -// if (opt->T < opt->min_HSP_score) opt->T = opt->min_HSP_score; // TODO: tie ->T to MEM_HSP_COEF - bwa_fill_scmat(opt->a, opt->b, opt->mat); - - if ((idx = bwa_idx_load(argv[optind], BWA_IDX_ALL)) == 0) return 1; // FIXME: memory leak - - ko = kopen(argv[optind + 1], &fd); - if (ko == 0) { - if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to open file `%s'.\n", __func__, argv[optind + 1]); - return 1; - } - fp = gzdopen(fd, "r"); - ks = kseq_init(fp); - if (optind + 2 < argc) { - if (opt->flag&MEM_F_PE) { - if (bwa_verbose >= 2) - fprintf(stderr, "[W::%s] when '-p' is in use, the second query file will be ignored.\n", __func__); - } else { - ko2 = kopen(argv[optind + 2], &fd2); - if (ko2 == 0) { - if (bwa_verbose >= 1) fprintf(stderr, "[E::%s] fail to open file `%s'.\n", __func__, argv[optind + 2]); - return 1; - } - fp2 = gzdopen(fd2, "r"); - ks2 = kseq_init(fp2); - opt->flag |= MEM_F_PE; - } - } - if (!(opt->flag & MEM_F_ALN_REG)) - bwa_print_sam_hdr(idx->bns, rg_line); - while ((seqs = bseq_read(opt->chunk_size * opt->n_threads, &n, ks, ks2)) != 0) { - int64_t size = 0; - if ((opt->flag & MEM_F_PE) && (n&1) == 1) { - if (bwa_verbose >= 2) - fprintf(stderr, "[W::%s] odd number of reads in the PE mode; last read dropped\n", __func__); - n = n>>1<<1; - } - if (!copy_comment) - for (i = 0; i < n; ++i) { - free(seqs[i].comment); seqs[i].comment = 0; - } - for (i = 0; i < n; ++i) size += seqs[i].l_seq; - if (bwa_verbose >= 3) - fprintf(stderr, "[M::%s] read %d sequences (%ld bp)...\n", __func__, n, (long)size); - mem_process_seqs(opt, idx->bwt, idx->bns, idx->pac, n_processed, n, seqs, pes0); - n_processed += n; - for (i = 0; i < n; ++i) { - 
if (seqs[i].sam) err_fputs(seqs[i].sam, stdout); - free(seqs[i].name); free(seqs[i].comment); free(seqs[i].seq); free(seqs[i].qual); free(seqs[i].sam); - } - free(seqs); - } - - free(opt); - bwa_idx_destroy(idx); - kseq_destroy(ks); - err_gzclose(fp); kclose(ko); - if (ks2) { - kseq_destroy(ks2); - err_gzclose(fp2); kclose(ko2); - } - return 0; -} - -int main_fastmap(int argc, char *argv[]) -{ - int c, i, min_iwidth = 20, min_len = 17, print_seq = 0; - kseq_t *seq; - bwtint_t k; - gzFile fp; - smem_i *itr; - const bwtintv_v *a; - bwaidx_t *idx; - - while ((c = getopt(argc, argv, "w:l:p")) >= 0) { - switch (c) { - case 'p': print_seq = 1; break; - case 'w': min_iwidth = atoi(optarg); break; - case 'l': min_len = atoi(optarg); break; - default: return 1; - } - } - if (optind + 1 >= argc) { - fprintf(stderr, "Usage: bwa fastmap [-p] [-l minLen=%d] [-w maxSaSize=%d] \n", min_len, min_iwidth); - return 1; - } - - fp = xzopen(argv[optind + 1], "r"); - seq = kseq_init(fp); - if ((idx = bwa_idx_load(argv[optind], BWA_IDX_BWT|BWA_IDX_BNS)) == 0) return 1; - itr = smem_itr_init(idx->bwt); - while (kseq_read(seq) >= 0) { - err_printf("SQ\t%s\t%ld", seq->name.s, seq->seq.l); - if (print_seq) { - err_putchar('\t'); - err_puts(seq->seq.s); - } else err_putchar('\n'); - for (i = 0; i < seq->seq.l; ++i) - seq->seq.s[i] = nst_nt4_table[(int)seq->seq.s[i]]; - smem_set_query(itr, seq->seq.l, (uint8_t*)seq->seq.s); - while ((a = smem_next(itr)) != 0) { - for (i = 0; i < a->n; ++i) { - bwtintv_t *p = &a->a[i]; - if ((uint32_t)p->info - (p->info>>32) < min_len) continue; - err_printf("EM\t%d\t%d\t%ld", (uint32_t)(p->info>>32), (uint32_t)p->info, (long)p->x[2]); - if (p->x[2] <= min_iwidth) { - for (k = 0; k < p->x[2]; ++k) { - bwtint_t pos; - int len, is_rev, ref_id; - len = (uint32_t)p->info - (p->info>>32); - pos = bns_depos(idx->bns, bwt_sa(idx->bwt, p->x[0] + k), &is_rev); - if (is_rev) pos -= len - 1; - bns_cnt_ambi(idx->bns, pos, len, &ref_id); - err_printf("\t%s:%c%ld", 
idx->bns->anns[ref_id].name, "+-"[is_rev], (long)(pos - idx->bns->anns[ref_id].offset) + 1); - } - } else err_puts("\t*"); - err_putchar('\n'); - } - } - err_puts("//"); - } - - smem_itr_destroy(itr); - bwa_idx_destroy(idx); - kseq_destroy(seq); - err_gzclose(fp); - return 0; -} diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/is.c --- a/bwa-0.7.9a/is.c Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,223 +0,0 @@ -/* - * sais.c for sais-lite - * Copyright (c) 2008 Yuta Mori All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, - * copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following - * conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES - * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT - * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, - * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR - * OTHER DEALINGS IN THE SOFTWARE. - */ - -#include - -#ifdef USE_MALLOC_WRAPPERS -# include "malloc_wrap.h" -#endif - -typedef unsigned char ubyte_t; -#define chr(i) (cs == sizeof(int) ? 
((const int *)T)[i]:((const unsigned char *)T)[i]) - -/* find the start or end of each bucket */ -static void getCounts(const unsigned char *T, int *C, int n, int k, int cs) -{ - int i; - for (i = 0; i < k; ++i) C[i] = 0; - for (i = 0; i < n; ++i) ++C[chr(i)]; -} -static void getBuckets(const int *C, int *B, int k, int end) -{ - int i, sum = 0; - if (end) { - for (i = 0; i < k; ++i) { - sum += C[i]; - B[i] = sum; - } - } else { - for (i = 0; i < k; ++i) { - sum += C[i]; - B[i] = sum - C[i]; - } - } -} - -/* compute SA */ -static void induceSA(const unsigned char *T, int *SA, int *C, int *B, int n, int k, int cs) -{ - int *b, i, j; - int c0, c1; - /* compute SAl */ - if (C == B) getCounts(T, C, n, k, cs); - getBuckets(C, B, k, 0); /* find starts of buckets */ - j = n - 1; - b = SA + B[c1 = chr(j)]; - *b++ = ((0 < j) && (chr(j - 1) < c1)) ? ~j : j; - for (i = 0; i < n; ++i) { - j = SA[i], SA[i] = ~j; - if (0 < j) { - --j; - if ((c0 = chr(j)) != c1) { - B[c1] = b - SA; - b = SA + B[c1 = c0]; - } - *b++ = ((0 < j) && (chr(j - 1) < c1)) ? ~j : j; - } - } - /* compute SAs */ - if (C == B) getCounts(T, C, n, k, cs); - getBuckets(C, B, k, 1); /* find ends of buckets */ - for (i = n - 1, b = SA + B[c1 = 0]; 0 <= i; --i) { - if (0 < (j = SA[i])) { - --j; - if ((c0 = chr(j)) != c1) { - B[c1] = b - SA; - b = SA + B[c1 = c0]; - } - *--b = ((j == 0) || (chr(j - 1) > c1)) ? ~j : j; - } else SA[i] = ~j; - } -} - -/* - * find the suffix array SA of T[0..n-1] in {0..k-1}^n use a working - * space (excluding T and SA) of at most 2n+O(1) for a constant alphabet - */ -static int sais_main(const unsigned char *T, int *SA, int fs, int n, int k, int cs) -{ - int *C, *B, *RA; - int i, j, c, m, p, q, plen, qlen, name; - int c0, c1; - int diff; - - /* stage 1: reduce the problem by at least 1/2 sort all the - * S-substrings */ - if (k <= fs) { - C = SA + n; - B = (k <= (fs - k)) ? 
C + k : C; - } else if ((C = B = (int *) malloc(k * sizeof(int))) == NULL) return -2; - getCounts(T, C, n, k, cs); - getBuckets(C, B, k, 1); /* find ends of buckets */ - for (i = 0; i < n; ++i) SA[i] = 0; - for (i = n - 2, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) { - if ((c0 = chr(i)) < (c1 + c)) c = 1; - else if (c != 0) SA[--B[c1]] = i + 1, c = 0; - } - induceSA(T, SA, C, B, n, k, cs); - if (fs < k) free(C); - /* compact all the sorted substrings into the first m items of SA - * 2*m must be not larger than n (proveable) */ - for (i = 0, m = 0; i < n; ++i) { - p = SA[i]; - if ((0 < p) && (chr(p - 1) > (c0 = chr(p)))) { - for (j = p + 1; (j < n) && (c0 == (c1 = chr(j))); ++j); - if ((j < n) && (c0 < c1)) SA[m++] = p; - } - } - for (i = m; i < n; ++i) SA[i] = 0; /* init the name array buffer */ - /* store the length of all substrings */ - for (i = n - 2, j = n, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) { - if ((c0 = chr(i)) < (c1 + c)) c = 1; - else if (c != 0) { - SA[m + ((i + 1) >> 1)] = j - i - 1; - j = i + 1; - c = 0; - } - } - /* find the lexicographic names of all substrings */ - for (i = 0, name = 0, q = n, qlen = 0; i < m; ++i) { - p = SA[i], plen = SA[m + (p >> 1)], diff = 1; - if (plen == qlen) { - for (j = 0; (j < plen) && (chr(p + j) == chr(q + j)); j++); - if (j == plen) diff = 0; - } - if (diff != 0) ++name, q = p, qlen = plen; - SA[m + (p >> 1)] = name; - } - - /* stage 2: solve the reduced problem recurse if names are not yet - * unique */ - if (name < m) { - RA = SA + n + fs - m; - for (i = n - 1, j = m - 1; m <= i; --i) { - if (SA[i] != 0) RA[j--] = SA[i] - 1; - } - if (sais_main((unsigned char *) RA, SA, fs + n - m * 2, m, name, sizeof(int)) != 0) return -2; - for (i = n - 2, j = m - 1, c = 0, c1 = chr(n - 1); 0 <= i; --i, c1 = c0) { - if ((c0 = chr(i)) < (c1 + c)) c = 1; - else if (c != 0) RA[j--] = i + 1, c = 0; /* get p1 */ - } - for (i = 0; i < m; ++i) SA[i] = RA[SA[i]]; /* get index */ - } - /* stage 3: induce the result for the 
original problem */ - if (k <= fs) { - C = SA + n; - B = (k <= (fs - k)) ? C + k : C; - } else if ((C = B = (int *) malloc(k * sizeof(int))) == NULL) return -2; - /* put all left-most S characters into their buckets */ - getCounts(T, C, n, k, cs); - getBuckets(C, B, k, 1); /* find ends of buckets */ - for (i = m; i < n; ++i) SA[i] = 0; /* init SA[m..n-1] */ - for (i = m - 1; 0 <= i; --i) { - j = SA[i], SA[i] = 0; - SA[--B[chr(j)]] = j; - } - induceSA(T, SA, C, B, n, k, cs); - if (fs < k) free(C); - return 0; -} - -/** - * Constructs the suffix array of a given string. - * @param T[0..n-1] The input string. - * @param SA[0..n] The output array of suffixes. - * @param n The length of the given string. - * @return 0 if no error occurred - */ -int is_sa(const ubyte_t *T, int *SA, int n) -{ - if ((T == NULL) || (SA == NULL) || (n < 0)) return -1; - SA[0] = n; - if (n <= 1) { - if (n == 1) SA[1] = 0; - return 0; - } - return sais_main(T, SA+1, 0, n, 256, 1); -} - -/** - * Constructs the burrows-wheeler transformed string of a given string. - * @param T[0..n-1] The input string. - * @param n The length of the given string. - * @return The primary index if no error occurred, -1 or -2 otherwise. - */ -int is_bwt(ubyte_t *T, int n) -{ - int *SA, i, primary = 0; - SA = (int*)calloc(n+1, sizeof(int)); - - if (is_sa(T, SA, n)) return -1; - - for (i = 0; i <= n; ++i) { - if (SA[i] == 0) primary = i; - else SA[i] = T[SA[i] - 1]; - } - for (i = 0; i < primary; ++i) T[i] = SA[i]; - for (; i < n; ++i) T[i] = SA[i + 1]; - free(SA); - return primary; -} diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/kbtree.h --- a/bwa-0.7.9a/kbtree.h Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,388 +0,0 @@ -/*- - * Copyright 1997-1999, 2001, John-Mark Gurney. - * 2008-2009, Attractive Chaos - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. 
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- */ - -#ifndef __AC_KBTREE_H -#define __AC_KBTREE_H - -#include -#include -#include - -#ifdef USE_MALLOC_WRAPPERS -# include "malloc_wrap.h" -#endif - -typedef struct { - int32_t is_internal:1, n:31; -} kbnode_t; - -#define __KB_KEY(type, x) ((type*)((char*)x + 4)) -#define __KB_PTR(btr, x) ((kbnode_t**)((char*)x + btr->off_ptr)) - -#define __KB_TREE_T(name) \ - typedef struct { \ - kbnode_t *root; \ - int off_key, off_ptr, ilen, elen; \ - int n, t; \ - int n_keys, n_nodes; \ - } kbtree_##name##_t; - -#define __KB_INIT(name, key_t) \ - kbtree_##name##_t *kb_init_##name(int size) \ - { \ - kbtree_##name##_t *b; \ - b = (kbtree_##name##_t*)calloc(1, sizeof(kbtree_##name##_t)); \ - b->t = ((size - 4 - sizeof(void*)) / (sizeof(void*) + sizeof(key_t)) + 1) >> 1; \ - if (b->t < 2) { \ - free(b); return 0; \ - } \ - b->n = 2 * b->t - 1; \ - b->off_ptr = 4 + b->n * sizeof(key_t); \ - b->ilen = (4 + sizeof(void*) + b->n * (sizeof(void*) + sizeof(key_t)) + 3) >> 2 << 2; \ - b->elen = (b->off_ptr + 3) >> 2 << 2; \ - b->root = (kbnode_t*)calloc(1, b->ilen); \ - ++b->n_nodes; \ - return b; \ - } - -#define __kb_destroy(b) do { \ - int i, max = 8; \ - kbnode_t *x, **top, **stack = 0; \ - if (b) { \ - top = stack = (kbnode_t**)calloc(max, sizeof(kbnode_t*)); \ - *top++ = (b)->root; \ - while (top != stack) { \ - x = *--top; \ - if (x == 0 || x->is_internal == 0) { free(x); continue; } \ - for (i = 0; i <= x->n; ++i) \ - if (__KB_PTR(b, x)[i]) { \ - if (top - stack == max) { \ - max <<= 1; \ - stack = (kbnode_t**)realloc(stack, max * sizeof(kbnode_t*)); \ - top = stack + (max>>1); \ - } \ - *top++ = __KB_PTR(b, x)[i]; \ - } \ - free(x); \ - } \ - } \ - free(b); free(stack); \ - } while (0) - -#define __kb_get_first(key_t, b, ret) do { \ - kbnode_t *__x = (b)->root; \ - while (__KB_PTR(b, __x)[0] != 0) \ - __x = __KB_PTR(b, __x)[0]; \ - (ret) = __KB_KEY(key_t, __x)[0]; \ - } while (0) - -#define __KB_GET_AUX0(name, key_t, __cmp) \ - static inline int __kb_get_aux_##name(const 
kbnode_t * __restrict x, const key_t * __restrict k, int *r) \ - { \ - int tr, *rr, begin, end, n = x->n >> 1; \ - if (x->n == 0) return -1; \ - if (__cmp(*k, __KB_KEY(key_t, x)[n]) < 0) { \ - begin = 0; end = n; \ - } else { begin = n; end = x->n - 1; } \ - rr = r? r : &tr; \ - n = end; \ - while (n >= begin && (*rr = __cmp(*k, __KB_KEY(key_t, x)[n])) < 0) --n; \ - return n; \ - } - -#define __KB_GET_AUX1(name, key_t, __cmp) \ - static inline int __kb_getp_aux_##name(const kbnode_t * __restrict x, const key_t * __restrict k, int *r) \ - { \ - int tr, *rr, begin = 0, end = x->n; \ - if (x->n == 0) return -1; \ - rr = r? r : &tr; \ - while (begin < end) { \ - int mid = (begin + end) >> 1; \ - if (__cmp(__KB_KEY(key_t, x)[mid], *k) < 0) begin = mid + 1; \ - else end = mid; \ - } \ - if (begin == x->n) { *rr = 1; return x->n - 1; } \ - if ((*rr = __cmp(*k, __KB_KEY(key_t, x)[begin])) < 0) --begin; \ - return begin; \ - } - -#define __KB_GET(name, key_t) \ - static key_t *kb_getp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \ - { \ - int i, r = 0; \ - kbnode_t *x = b->root; \ - while (x) { \ - i = __kb_getp_aux_##name(x, k, &r); \ - if (i >= 0 && r == 0) return &__KB_KEY(key_t, x)[i]; \ - if (x->is_internal == 0) return 0; \ - x = __KB_PTR(b, x)[i + 1]; \ - } \ - return 0; \ - } \ - static inline key_t *kb_get_##name(kbtree_##name##_t *b, const key_t k) \ - { \ - return kb_getp_##name(b, &k); \ - } - -#define __KB_INTERVAL(name, key_t) \ - static void kb_intervalp_##name(kbtree_##name##_t *b, const key_t * __restrict k, key_t **lower, key_t **upper) \ - { \ - int i, r = 0; \ - kbnode_t *x = b->root; \ - *lower = *upper = 0; \ - while (x) { \ - i = __kb_getp_aux_##name(x, k, &r); \ - if (i >= 0 && r == 0) { \ - *lower = *upper = &__KB_KEY(key_t, x)[i]; \ - return; \ - } \ - if (i >= 0) *lower = &__KB_KEY(key_t, x)[i]; \ - if (i < x->n - 1) *upper = &__KB_KEY(key_t, x)[i + 1]; \ - if (x->is_internal == 0) return; \ - x = __KB_PTR(b, x)[i + 1]; \ - } \ - } \ - 
static inline void kb_interval_##name(kbtree_##name##_t *b, const key_t k, key_t **lower, key_t **upper) \ - { \ - kb_intervalp_##name(b, &k, lower, upper); \ - } - -#define __KB_PUT(name, key_t, __cmp) \ - /* x must be an internal node */ \ - static void __kb_split_##name(kbtree_##name##_t *b, kbnode_t *x, int i, kbnode_t *y) \ - { \ - kbnode_t *z; \ - z = (kbnode_t*)calloc(1, y->is_internal? b->ilen : b->elen); \ - ++b->n_nodes; \ - z->is_internal = y->is_internal; \ - z->n = b->t - 1; \ - memcpy(__KB_KEY(key_t, z), __KB_KEY(key_t, y) + b->t, sizeof(key_t) * (b->t - 1)); \ - if (y->is_internal) memcpy(__KB_PTR(b, z), __KB_PTR(b, y) + b->t, sizeof(void*) * b->t); \ - y->n = b->t - 1; \ - memmove(__KB_PTR(b, x) + i + 2, __KB_PTR(b, x) + i + 1, sizeof(void*) * (x->n - i)); \ - __KB_PTR(b, x)[i + 1] = z; \ - memmove(__KB_KEY(key_t, x) + i + 1, __KB_KEY(key_t, x) + i, sizeof(key_t) * (x->n - i)); \ - __KB_KEY(key_t, x)[i] = __KB_KEY(key_t, y)[b->t - 1]; \ - ++x->n; \ - } \ - static void __kb_putp_aux_##name(kbtree_##name##_t *b, kbnode_t *x, const key_t * __restrict k) \ - { \ - int i = x->n - 1; \ - if (x->is_internal == 0) { \ - i = __kb_getp_aux_##name(x, k, 0); \ - if (i != x->n - 1) \ - memmove(__KB_KEY(key_t, x) + i + 2, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \ - __KB_KEY(key_t, x)[i + 1] = *k; \ - ++x->n; \ - } else { \ - i = __kb_getp_aux_##name(x, k, 0) + 1; \ - if (__KB_PTR(b, x)[i]->n == 2 * b->t - 1) { \ - __kb_split_##name(b, x, i, __KB_PTR(b, x)[i]); \ - if (__cmp(*k, __KB_KEY(key_t, x)[i]) > 0) ++i; \ - } \ - __kb_putp_aux_##name(b, __KB_PTR(b, x)[i], k); \ - } \ - } \ - static void kb_putp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \ - { \ - kbnode_t *r, *s; \ - ++b->n_keys; \ - r = b->root; \ - if (r->n == 2 * b->t - 1) { \ - ++b->n_nodes; \ - s = (kbnode_t*)calloc(1, b->ilen); \ - b->root = s; s->is_internal = 1; s->n = 0; \ - __KB_PTR(b, s)[0] = r; \ - __kb_split_##name(b, s, 0, r); \ - r = s; \ - } \ - 
__kb_putp_aux_##name(b, r, k); \ - } \ - static inline void kb_put_##name(kbtree_##name##_t *b, const key_t k) \ - { \ - kb_putp_##name(b, &k); \ - } - - -#define __KB_DEL(name, key_t) \ - static key_t __kb_delp_aux_##name(kbtree_##name##_t *b, kbnode_t *x, const key_t * __restrict k, int s) \ - { \ - int yn, zn, i, r = 0; \ - kbnode_t *xp, *y, *z; \ - key_t kp; \ - if (x == 0) return *k; \ - if (s) { /* s can only be 0, 1 or 2 */ \ - r = x->is_internal == 0? 0 : s == 1? 1 : -1; \ - i = s == 1? x->n - 1 : -1; \ - } else i = __kb_getp_aux_##name(x, k, &r); \ - if (x->is_internal == 0) { \ - if (s == 2) ++i; \ - kp = __KB_KEY(key_t, x)[i]; \ - memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \ - --x->n; \ - return kp; \ - } \ - if (r == 0) { \ - if ((yn = __KB_PTR(b, x)[i]->n) >= b->t) { \ - xp = __KB_PTR(b, x)[i]; \ - kp = __KB_KEY(key_t, x)[i]; \ - __KB_KEY(key_t, x)[i] = __kb_delp_aux_##name(b, xp, 0, 1); \ - return kp; \ - } else if ((zn = __KB_PTR(b, x)[i + 1]->n) >= b->t) { \ - xp = __KB_PTR(b, x)[i + 1]; \ - kp = __KB_KEY(key_t, x)[i]; \ - __KB_KEY(key_t, x)[i] = __kb_delp_aux_##name(b, xp, 0, 2); \ - return kp; \ - } else if (yn == b->t - 1 && zn == b->t - 1) { \ - y = __KB_PTR(b, x)[i]; z = __KB_PTR(b, x)[i + 1]; \ - __KB_KEY(key_t, y)[y->n++] = *k; \ - memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, z), z->n * sizeof(key_t)); \ - if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, z), (z->n + 1) * sizeof(void*)); \ - y->n += z->n; \ - memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \ - memmove(__KB_PTR(b, x) + i + 1, __KB_PTR(b, x) + i + 2, (x->n - i - 1) * sizeof(void*)); \ - --x->n; \ - free(z); \ - return __kb_delp_aux_##name(b, y, k, s); \ - } \ - } \ - ++i; \ - if ((xp = __KB_PTR(b, x)[i])->n == b->t - 1) { \ - if (i > 0 && (y = __KB_PTR(b, x)[i - 1])->n >= b->t) { \ - memmove(__KB_KEY(key_t, xp) + 1, __KB_KEY(key_t, xp), xp->n * sizeof(key_t)); \ - 
if (xp->is_internal) memmove(__KB_PTR(b, xp) + 1, __KB_PTR(b, xp), (xp->n + 1) * sizeof(void*)); \ - __KB_KEY(key_t, xp)[0] = __KB_KEY(key_t, x)[i - 1]; \ - __KB_KEY(key_t, x)[i - 1] = __KB_KEY(key_t, y)[y->n - 1]; \ - if (xp->is_internal) __KB_PTR(b, xp)[0] = __KB_PTR(b, y)[y->n]; \ - --y->n; ++xp->n; \ - } else if (i < x->n && (y = __KB_PTR(b, x)[i + 1])->n >= b->t) { \ - __KB_KEY(key_t, xp)[xp->n++] = __KB_KEY(key_t, x)[i]; \ - __KB_KEY(key_t, x)[i] = __KB_KEY(key_t, y)[0]; \ - if (xp->is_internal) __KB_PTR(b, xp)[xp->n] = __KB_PTR(b, y)[0]; \ - --y->n; \ - memmove(__KB_KEY(key_t, y), __KB_KEY(key_t, y) + 1, y->n * sizeof(key_t)); \ - if (y->is_internal) memmove(__KB_PTR(b, y), __KB_PTR(b, y) + 1, (y->n + 1) * sizeof(void*)); \ - } else if (i > 0 && (y = __KB_PTR(b, x)[i - 1])->n == b->t - 1) { \ - __KB_KEY(key_t, y)[y->n++] = __KB_KEY(key_t, x)[i - 1]; \ - memmove(__KB_KEY(key_t, y) + y->n, __KB_KEY(key_t, xp), xp->n * sizeof(key_t)); \ - if (y->is_internal) memmove(__KB_PTR(b, y) + y->n, __KB_PTR(b, xp), (xp->n + 1) * sizeof(void*)); \ - y->n += xp->n; \ - memmove(__KB_KEY(key_t, x) + i - 1, __KB_KEY(key_t, x) + i, (x->n - i) * sizeof(key_t)); \ - memmove(__KB_PTR(b, x) + i, __KB_PTR(b, x) + i + 1, (x->n - i) * sizeof(void*)); \ - --x->n; \ - free(xp); \ - xp = y; \ - } else if (i < x->n && (y = __KB_PTR(b, x)[i + 1])->n == b->t - 1) { \ - __KB_KEY(key_t, xp)[xp->n++] = __KB_KEY(key_t, x)[i]; \ - memmove(__KB_KEY(key_t, xp) + xp->n, __KB_KEY(key_t, y), y->n * sizeof(key_t)); \ - if (xp->is_internal) memmove(__KB_PTR(b, xp) + xp->n, __KB_PTR(b, y), (y->n + 1) * sizeof(void*)); \ - xp->n += y->n; \ - memmove(__KB_KEY(key_t, x) + i, __KB_KEY(key_t, x) + i + 1, (x->n - i - 1) * sizeof(key_t)); \ - memmove(__KB_PTR(b, x) + i + 1, __KB_PTR(b, x) + i + 2, (x->n - i - 1) * sizeof(void*)); \ - --x->n; \ - free(y); \ - } \ - } \ - return __kb_delp_aux_##name(b, xp, k, s); \ - } \ - static key_t kb_delp_##name(kbtree_##name##_t *b, const key_t * __restrict k) \ - { \ - 
kbnode_t *x; \ - key_t ret; \ - ret = __kb_delp_aux_##name(b, b->root, k, 0); \ - --b->n_keys; \ - if (b->root->n == 0 && b->root->is_internal) { \ - --b->n_nodes; \ - x = b->root; \ - b->root = __KB_PTR(b, x)[0]; \ - free(x); \ - } \ - return ret; \ - } \ - static inline key_t kb_del_##name(kbtree_##name##_t *b, const key_t k) \ - { \ - return kb_delp_##name(b, &k); \ - } - -typedef struct { - kbnode_t *x; - int i; -} __kbstack_t; - -#define __kb_traverse(key_t, b, __func) do { \ - int __kmax = 8; \ - __kbstack_t *__kstack, *__kp; \ - __kp = __kstack = (__kbstack_t*)calloc(__kmax, sizeof(__kbstack_t)); \ - __kp->x = (b)->root; __kp->i = 0; \ - for (;;) { \ - while (__kp->x && __kp->i <= __kp->x->n) { \ - if (__kp - __kstack == __kmax - 1) { \ - __kmax <<= 1; \ - __kstack = (__kbstack_t*)realloc(__kstack, __kmax * sizeof(__kbstack_t)); \ - __kp = __kstack + (__kmax>>1) - 1; \ - } \ - (__kp+1)->i = 0; (__kp+1)->x = __kp->x->is_internal? __KB_PTR(b, __kp->x)[__kp->i] : 0; \ - ++__kp; \ - } \ - --__kp; \ - if (__kp >= __kstack) { \ - if (__kp->x && __kp->i < __kp->x->n) __func(&__KB_KEY(key_t, __kp->x)[__kp->i]); \ - ++__kp->i; \ - } else break; \ - } \ - free(__kstack); \ - } while (0) - -#define KBTREE_INIT(name, key_t, __cmp) \ - __KB_TREE_T(name) \ - __KB_INIT(name, key_t) \ - __KB_GET_AUX1(name, key_t, __cmp) \ - __KB_GET(name, key_t) \ - __KB_INTERVAL(name, key_t) \ - __KB_PUT(name, key_t, __cmp) \ - __KB_DEL(name, key_t) - -#define KB_DEFAULT_SIZE 512 - -#define kbtree_t(name) kbtree_##name##_t -#define kb_init(name, s) kb_init_##name(s) -#define kb_destroy(name, b) __kb_destroy(b) -#define kb_get(name, b, k) kb_get_##name(b, k) -#define kb_put(name, b, k) kb_put_##name(b, k) -#define kb_del(name, b, k) kb_del_##name(b, k) -#define kb_interval(name, b, k, l, u) kb_interval_##name(b, k, l, u) -#define kb_getp(name, b, k) kb_getp_##name(b, k) -#define kb_putp(name, b, k) kb_putp_##name(b, k) -#define kb_delp(name, b, k) kb_delp_##name(b, k) -#define 
kb_intervalp(name, b, k, l, u) kb_intervalp_##name(b, k, l, u) - -#define kb_size(b) ((b)->n_keys) - -#define kb_generic_cmp(a, b) (((b) < (a)) - ((a) < (b))) -#define kb_str_cmp(a, b) strcmp(a, b) - -#endif diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/khash.h --- a/bwa-0.7.9a/khash.h Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,614 +0,0 @@ -/* The MIT License - - Copyright (c) 2008, 2009, 2011 by Attractive Chaos - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. 
-*/ - -/* - An example: - -#include "khash.h" -KHASH_MAP_INIT_INT(32, char) -int main() { - int ret, is_missing; - khiter_t k; - khash_t(32) *h = kh_init(32); - k = kh_put(32, h, 5, &ret); - kh_value(h, k) = 10; - k = kh_get(32, h, 10); - is_missing = (k == kh_end(h)); - k = kh_get(32, h, 5); - kh_del(32, h, k); - for (k = kh_begin(h); k != kh_end(h); ++k) - if (kh_exist(h, k)) kh_value(h, k) = 1; - kh_destroy(32, h); - return 0; -} -*/ - -/* - 2011-12-29 (0.2.7): - - * Minor code clean up; no actual effect. - - 2011-09-16 (0.2.6): - - * The capacity is a power of 2. This seems to dramatically improve the - speed for simple keys. Thank Zilong Tan for the suggestion. Reference: - - - http://code.google.com/p/ulib/ - - http://nothings.org/computer/judy/ - - * Allow to optionally use linear probing which usually has better - performance for random input. Double hashing is still the default as it - is more robust to certain non-random input. - - * Added Wang's integer hash function (not used by default). This hash - function is more robust to certain non-random input. - - 2011-02-14 (0.2.5): - - * Allow to declare global functions. - - 2009-09-26 (0.2.4): - - * Improve portability - - 2008-09-19 (0.2.3): - - * Corrected the example - * Improved interfaces - - 2008-09-11 (0.2.2): - - * Improved speed a little in kh_put() - - 2008-09-10 (0.2.1): - - * Added kh_clear() - * Fixed a compiling error - - 2008-09-02 (0.2.0): - - * Changed to token concatenation which increases flexibility. - - 2008-08-31 (0.1.2): - - * Fixed a bug in kh_get(), which has not been tested previously. - - 2008-08-31 (0.1.1): - - * Added destructor -*/ - - -#ifndef __AC_KHASH_H -#define __AC_KHASH_H - -/*! - @header - - Generic hash table library. 
- */ - -#define AC_VERSION_KHASH_H "0.2.6" - -#include -#include -#include - -#ifdef USE_MALLOC_WRAPPERS -# include "malloc_wrap.h" -#endif - -/* compipler specific configuration */ - -#if UINT_MAX == 0xffffffffu -typedef unsigned int khint32_t; -#elif ULONG_MAX == 0xffffffffu -typedef unsigned long khint32_t; -#endif - -#if ULONG_MAX == ULLONG_MAX -typedef unsigned long khint64_t; -#else -typedef unsigned long long khint64_t; -#endif - -#ifdef _MSC_VER -#define kh_inline __inline -#else -#define kh_inline inline -#endif - -typedef khint32_t khint_t; -typedef khint_t khiter_t; - -#define __ac_isempty(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&2) -#define __ac_isdel(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&1) -#define __ac_iseither(flag, i) ((flag[i>>4]>>((i&0xfU)<<1))&3) -#define __ac_set_isdel_false(flag, i) (flag[i>>4]&=~(1ul<<((i&0xfU)<<1))) -#define __ac_set_isempty_false(flag, i) (flag[i>>4]&=~(2ul<<((i&0xfU)<<1))) -#define __ac_set_isboth_false(flag, i) (flag[i>>4]&=~(3ul<<((i&0xfU)<<1))) -#define __ac_set_isdel_true(flag, i) (flag[i>>4]|=1ul<<((i&0xfU)<<1)) - -#ifdef KHASH_LINEAR -#define __ac_inc(k, m) 1 -#else -#define __ac_inc(k, m) (((k)>>3 ^ (k)<<3) | 1) & (m) -#endif - -#define __ac_fsize(m) ((m) < 16? 
1 : (m)>>4) - -#ifndef kroundup32 -#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) -#endif - -#ifndef kcalloc -#define kcalloc(N,Z) calloc(N,Z) -#endif -#ifndef kmalloc -#define kmalloc(Z) malloc(Z) -#endif -#ifndef krealloc -#define krealloc(P,Z) realloc(P,Z) -#endif -#ifndef kfree -#define kfree(P) free(P) -#endif - -static const double __ac_HASH_UPPER = 0.77; - -#define __KHASH_TYPE(name, khkey_t, khval_t) \ - typedef struct { \ - khint_t n_buckets, size, n_occupied, upper_bound; \ - khint32_t *flags; \ - khkey_t *keys; \ - khval_t *vals; \ - } kh_##name##_t; - -#define __KHASH_PROTOTYPES(name, khkey_t, khval_t) \ - extern kh_##name##_t *kh_init_##name(void); \ - extern void kh_destroy_##name(kh_##name##_t *h); \ - extern void kh_clear_##name(kh_##name##_t *h); \ - extern khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key); \ - extern int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets); \ - extern khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret); \ - extern void kh_del_##name(kh_##name##_t *h, khint_t x); - -#define __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - SCOPE kh_##name##_t *kh_init_##name(void) { \ - return (kh_##name##_t*)kcalloc(1, sizeof(kh_##name##_t)); \ - } \ - SCOPE void kh_destroy_##name(kh_##name##_t *h) \ - { \ - if (h) { \ - kfree((void *)h->keys); kfree(h->flags); \ - kfree((void *)h->vals); \ - kfree(h); \ - } \ - } \ - SCOPE void kh_clear_##name(kh_##name##_t *h) \ - { \ - if (h && h->flags) { \ - memset(h->flags, 0xaa, __ac_fsize(h->n_buckets) * sizeof(khint32_t)); \ - h->size = h->n_occupied = 0; \ - } \ - } \ - SCOPE khint_t kh_get_##name(const kh_##name##_t *h, khkey_t key) \ - { \ - if (h->n_buckets) { \ - khint_t inc, k, i, last, mask; \ - mask = h->n_buckets - 1; \ - k = __hash_func(key); i = k & mask; \ - inc = __ac_inc(k, mask); last = i; /* inc==1 for linear probing */ \ - while (!__ac_isempty(h->flags, i) 
&& (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ - i = (i + inc) & mask; \ - if (i == last) return h->n_buckets; \ - } \ - return __ac_iseither(h->flags, i)? h->n_buckets : i; \ - } else return 0; \ - } \ - SCOPE int kh_resize_##name(kh_##name##_t *h, khint_t new_n_buckets) \ - { /* This function uses 0.25*n_bucktes bytes of working space instead of [sizeof(key_t+val_t)+.25]*n_buckets. */ \ - khint32_t *new_flags = 0; \ - khint_t j = 1; \ - { \ - kroundup32(new_n_buckets); \ - if (new_n_buckets < 4) new_n_buckets = 4; \ - if (h->size >= (khint_t)(new_n_buckets * __ac_HASH_UPPER + 0.5)) j = 0; /* requested size is too small */ \ - else { /* hash table size to be changed (shrink or expand); rehash */ \ - new_flags = (khint32_t*)kmalloc(__ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ - if (!new_flags) return -1; \ - memset(new_flags, 0xaa, __ac_fsize(new_n_buckets) * sizeof(khint32_t)); \ - if (h->n_buckets < new_n_buckets) { /* expand */ \ - khkey_t *new_keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (!new_keys) return -1; \ - h->keys = new_keys; \ - if (kh_is_map) { \ - khval_t *new_vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ - if (!new_vals) return -1; \ - h->vals = new_vals; \ - } \ - } /* otherwise shrink */ \ - } \ - } \ - if (j) { /* rehashing is needed */ \ - for (j = 0; j != h->n_buckets; ++j) { \ - if (__ac_iseither(h->flags, j) == 0) { \ - khkey_t key = h->keys[j]; \ - khval_t val; \ - khint_t new_mask; \ - new_mask = new_n_buckets - 1; \ - if (kh_is_map) val = h->vals[j]; \ - __ac_set_isdel_true(h->flags, j); \ - while (1) { /* kick-out process; sort of like in Cuckoo hashing */ \ - khint_t inc, k, i; \ - k = __hash_func(key); \ - i = k & new_mask; \ - inc = __ac_inc(k, new_mask); \ - while (!__ac_isempty(new_flags, i)) i = (i + inc) & new_mask; \ - __ac_set_isempty_false(new_flags, i); \ - if (i < h->n_buckets && __ac_iseither(h->flags, i) == 0) { /* kick out the 
existing element */ \ - { khkey_t tmp = h->keys[i]; h->keys[i] = key; key = tmp; } \ - if (kh_is_map) { khval_t tmp = h->vals[i]; h->vals[i] = val; val = tmp; } \ - __ac_set_isdel_true(h->flags, i); /* mark it as deleted in the old hash table */ \ - } else { /* write the element and jump out of the loop */ \ - h->keys[i] = key; \ - if (kh_is_map) h->vals[i] = val; \ - break; \ - } \ - } \ - } \ - } \ - if (h->n_buckets > new_n_buckets) { /* shrink the hash table */ \ - h->keys = (khkey_t*)krealloc((void *)h->keys, new_n_buckets * sizeof(khkey_t)); \ - if (kh_is_map) h->vals = (khval_t*)krealloc((void *)h->vals, new_n_buckets * sizeof(khval_t)); \ - } \ - kfree(h->flags); /* free the working space */ \ - h->flags = new_flags; \ - h->n_buckets = new_n_buckets; \ - h->n_occupied = h->size; \ - h->upper_bound = (khint_t)(h->n_buckets * __ac_HASH_UPPER + 0.5); \ - } \ - return 0; \ - } \ - SCOPE khint_t kh_put_##name(kh_##name##_t *h, khkey_t key, int *ret) \ - { \ - khint_t x; \ - if (h->n_occupied >= h->upper_bound) { /* update the hash table */ \ - if (h->n_buckets > (h->size<<1)) { \ - if (kh_resize_##name(h, h->n_buckets - 1) < 0) { /* clear "deleted" elements */ \ - *ret = -1; return h->n_buckets; \ - } \ - } else if (kh_resize_##name(h, h->n_buckets + 1) < 0) { /* expand the hash table */ \ - *ret = -1; return h->n_buckets; \ - } \ - } /* TODO: to implement automatically shrinking; resize() already support shrinking */ \ - { \ - khint_t inc, k, i, site, last, mask = h->n_buckets - 1; \ - x = site = h->n_buckets; k = __hash_func(key); i = k & mask; \ - if (__ac_isempty(h->flags, i)) x = i; /* for speed up */ \ - else { \ - inc = __ac_inc(k, mask); last = i; \ - while (!__ac_isempty(h->flags, i) && (__ac_isdel(h->flags, i) || !__hash_equal(h->keys[i], key))) { \ - if (__ac_isdel(h->flags, i)) site = i; \ - i = (i + inc) & mask; \ - if (i == last) { x = site; break; } \ - } \ - if (x == h->n_buckets) { \ - if (__ac_isempty(h->flags, i) && site != h->n_buckets) x = 
site; \ - else x = i; \ - } \ - } \ - } \ - if (__ac_isempty(h->flags, x)) { /* not present at all */ \ - h->keys[x] = key; \ - __ac_set_isboth_false(h->flags, x); \ - ++h->size; ++h->n_occupied; \ - *ret = 1; \ - } else if (__ac_isdel(h->flags, x)) { /* deleted */ \ - h->keys[x] = key; \ - __ac_set_isboth_false(h->flags, x); \ - ++h->size; \ - *ret = 2; \ - } else *ret = 0; /* Don't touch h->keys[x] if present and not deleted */ \ - return x; \ - } \ - SCOPE void kh_del_##name(kh_##name##_t *h, khint_t x) \ - { \ - if (x != h->n_buckets && !__ac_iseither(h->flags, x)) { \ - __ac_set_isdel_true(h->flags, x); \ - --h->size; \ - } \ - } - -#define KHASH_DECLARE(name, khkey_t, khval_t) \ - __KHASH_TYPE(name, khkey_t, khval_t) \ - __KHASH_PROTOTYPES(name, khkey_t, khval_t) - -#define KHASH_INIT2(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - __KHASH_TYPE(name, khkey_t, khval_t) \ - __KHASH_IMPL(name, SCOPE, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) - -#define KHASH_INIT(name, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) \ - KHASH_INIT2(name, static kh_inline, khkey_t, khval_t, kh_is_map, __hash_func, __hash_equal) - -/* --- BEGIN OF HASH FUNCTIONS --- */ - -/*! @function - @abstract Integer hash function - @param key The integer [khint32_t] - @return The hash value [khint_t] - */ -#define kh_int_hash_func(key) (khint32_t)(key) -/*! @function - @abstract Integer comparison function - */ -#define kh_int_hash_equal(a, b) ((a) == (b)) -/*! @function - @abstract 64-bit integer hash function - @param key The integer [khint64_t] - @return The hash value [khint_t] - */ -#define kh_int64_hash_func(key) (khint32_t)((key)>>33^(key)^(key)<<11) -/*! @function - @abstract 64-bit integer comparison function - */ -#define kh_int64_hash_equal(a, b) ((a) == (b)) -/*! 
@function - @abstract const char* hash function - @param s Pointer to a null terminated string - @return The hash value - */ -static kh_inline khint_t __ac_X31_hash_string(const char *s) -{ - khint_t h = (khint_t)*s; - if (h) for (++s ; *s; ++s) h = (h << 5) - h + (khint_t)*s; - return h; -} -/*! @function - @abstract Another interface to const char* hash function - @param key Pointer to a null terminated string [const char*] - @return The hash value [khint_t] - */ -#define kh_str_hash_func(key) __ac_X31_hash_string(key) -/*! @function - @abstract Const char* comparison function - */ -#define kh_str_hash_equal(a, b) (strcmp(a, b) == 0) - -static kh_inline khint_t __ac_Wang_hash(khint_t key) -{ - key += ~(key << 15); - key ^= (key >> 10); - key += (key << 3); - key ^= (key >> 6); - key += ~(key << 11); - key ^= (key >> 16); - return key; -} -#define kh_int_hash_func2(k) __ac_Wang_hash((khint_t)key) - -/* --- END OF HASH FUNCTIONS --- */ - -/* Other convenient macros... */ - -/*! - @abstract Type of the hash table. - @param name Name of the hash table [symbol] - */ -#define khash_t(name) kh_##name##_t - -/*! @function - @abstract Initiate a hash table. - @param name Name of the hash table [symbol] - @return Pointer to the hash table [khash_t(name)*] - */ -#define kh_init(name) kh_init_##name() - -/*! @function - @abstract Destroy a hash table. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - */ -#define kh_destroy(name, h) kh_destroy_##name(h) - -/*! @function - @abstract Reset a hash table without deallocating memory. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - */ -#define kh_clear(name, h) kh_clear_##name(h) - -/*! @function - @abstract Resize a hash table. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - @param s New size [khint_t] - */ -#define kh_resize(name, h, s) kh_resize_##name(h, s) - -/*! 
@function - @abstract Insert a key to the hash table. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - @param k Key [type of keys] - @param r Extra return code: 0 if the key is present in the hash table; - 1 if the bucket is empty (never used); 2 if the element in - the bucket has been deleted [int*] - @return Iterator to the inserted element [khint_t] - */ -#define kh_put(name, h, k, r) kh_put_##name(h, k, r) - -/*! @function - @abstract Retrieve a key from the hash table. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - @param k Key [type of keys] - @return Iterator to the found element, or kh_end(h) is the element is absent [khint_t] - */ -#define kh_get(name, h, k) kh_get_##name(h, k) - -/*! @function - @abstract Remove a key from the hash table. - @param name Name of the hash table [symbol] - @param h Pointer to the hash table [khash_t(name)*] - @param k Iterator to the element to be deleted [khint_t] - */ -#define kh_del(name, h, k) kh_del_##name(h, k) - -/*! @function - @abstract Test whether a bucket contains data. - @param h Pointer to the hash table [khash_t(name)*] - @param x Iterator to the bucket [khint_t] - @return 1 if containing data; 0 otherwise [int] - */ -#define kh_exist(h, x) (!__ac_iseither((h)->flags, (x))) - -/*! @function - @abstract Get key given an iterator - @param h Pointer to the hash table [khash_t(name)*] - @param x Iterator to the bucket [khint_t] - @return Key [type of keys] - */ -#define kh_key(h, x) ((h)->keys[x]) - -/*! @function - @abstract Get value given an iterator - @param h Pointer to the hash table [khash_t(name)*] - @param x Iterator to the bucket [khint_t] - @return Value [type of values] - @discussion For hash sets, calling this results in segfault. - */ -#define kh_val(h, x) ((h)->vals[x]) - -/*! @function - @abstract Alias of kh_val() - */ -#define kh_value(h, x) ((h)->vals[x]) - -/*! 
@function - @abstract Get the start iterator - @param h Pointer to the hash table [khash_t(name)*] - @return The start iterator [khint_t] - */ -#define kh_begin(h) (khint_t)(0) - -/*! @function - @abstract Get the end iterator - @param h Pointer to the hash table [khash_t(name)*] - @return The end iterator [khint_t] - */ -#define kh_end(h) ((h)->n_buckets) - -/*! @function - @abstract Get the number of elements in the hash table - @param h Pointer to the hash table [khash_t(name)*] - @return Number of elements in the hash table [khint_t] - */ -#define kh_size(h) ((h)->size) - -/*! @function - @abstract Get the number of buckets in the hash table - @param h Pointer to the hash table [khash_t(name)*] - @return Number of buckets in the hash table [khint_t] - */ -#define kh_n_buckets(h) ((h)->n_buckets) - -/*! @function - @abstract Iterate over the entries in the hash table - @param h Pointer to the hash table [khash_t(name)*] - @param kvar Variable to which key will be assigned - @param vvar Variable to which value will be assigned - @param code Block of code to execute - */ -#define kh_foreach(h, kvar, vvar, code) { khint_t __i; \ - for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ - if (!kh_exist(h,__i)) continue; \ - (kvar) = kh_key(h,__i); \ - (vvar) = kh_val(h,__i); \ - code; \ - } } - -/*! @function - @abstract Iterate over the values in the hash table - @param h Pointer to the hash table [khash_t(name)*] - @param vvar Variable to which value will be assigned - @param code Block of code to execute - */ -#define kh_foreach_value(h, vvar, code) { khint_t __i; \ - for (__i = kh_begin(h); __i != kh_end(h); ++__i) { \ - if (!kh_exist(h,__i)) continue; \ - (vvar) = kh_val(h,__i); \ - code; \ - } } - -/* More conenient interfaces */ - -/*! 
@function - @abstract Instantiate a hash set containing integer keys - @param name Name of the hash table [symbol] - */ -#define KHASH_SET_INIT_INT(name) \ - KHASH_INIT(name, khint32_t, char, 0, kh_int_hash_func, kh_int_hash_equal) - -/*! @function - @abstract Instantiate a hash map containing integer keys - @param name Name of the hash table [symbol] - @param khval_t Type of values [type] - */ -#define KHASH_MAP_INIT_INT(name, khval_t) \ - KHASH_INIT(name, khint32_t, khval_t, 1, kh_int_hash_func, kh_int_hash_equal) - -/*! @function - @abstract Instantiate a hash map containing 64-bit integer keys - @param name Name of the hash table [symbol] - */ -#define KHASH_SET_INIT_INT64(name) \ - KHASH_INIT(name, khint64_t, char, 0, kh_int64_hash_func, kh_int64_hash_equal) - -/*! @function - @abstract Instantiate a hash map containing 64-bit integer keys - @param name Name of the hash table [symbol] - @param khval_t Type of values [type] - */ -#define KHASH_MAP_INIT_INT64(name, khval_t) \ - KHASH_INIT(name, khint64_t, khval_t, 1, kh_int64_hash_func, kh_int64_hash_equal) - -typedef const char *kh_cstr_t; -/*! @function - @abstract Instantiate a hash map containing const char* keys - @param name Name of the hash table [symbol] - */ -#define KHASH_SET_INIT_STR(name) \ - KHASH_INIT(name, kh_cstr_t, char, 0, kh_str_hash_func, kh_str_hash_equal) - -/*! 
@function - @abstract Instantiate a hash map containing const char* keys - @param name Name of the hash table [symbol] - @param khval_t Type of values [type] - */ -#define KHASH_MAP_INIT_STR(name, khval_t) \ - KHASH_INIT(name, kh_cstr_t, khval_t, 1, kh_str_hash_func, kh_str_hash_equal) - -#endif /* __AC_KHASH_H */ diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/kopen.c --- a/bwa-0.7.9a/kopen.c Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,374 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#ifndef _WIN32 -#include -#include -#include -#endif - -#ifdef USE_MALLOC_WRAPPERS -# include "malloc_wrap.h" -#endif - -#ifdef _WIN32 -#define _KO_NO_NET -#endif - -#ifndef _KO_NO_NET -static int socket_wait(int fd, int is_read) -{ - fd_set fds, *fdr = 0, *fdw = 0; - struct timeval tv; - int ret; - tv.tv_sec = 5; tv.tv_usec = 0; // 5 seconds time out - FD_ZERO(&fds); - FD_SET(fd, &fds); - if (is_read) fdr = &fds; - else fdw = &fds; - ret = select(fd+1, fdr, fdw, 0, &tv); - if (ret == -1) perror("select"); - return ret; -} - -static int socket_connect(const char *host, const char *port) -{ -#define __err_connect(func) do { perror(func); freeaddrinfo(res); return -1; } while (0) - - int on = 1, fd; - struct linger lng = { 0, 0 }; - struct addrinfo hints, *res = 0; - memset(&hints, 0, sizeof(struct addrinfo)); - hints.ai_family = AF_UNSPEC; - hints.ai_socktype = SOCK_STREAM; - if (getaddrinfo(host, port, &hints, &res) != 0) __err_connect("getaddrinfo"); - if ((fd = socket(res->ai_family, res->ai_socktype, res->ai_protocol)) == -1) __err_connect("socket"); - if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)) == -1) __err_connect("setsockopt"); - if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lng, sizeof(lng)) == -1) __err_connect("setsockopt"); - if (connect(fd, res->ai_addr, res->ai_addrlen) != 0) __err_connect("connect"); - freeaddrinfo(res); - return fd; -#undef __err_connect 
-} - -static int write_bytes(int fd, const char *buf, size_t len) -{ - ssize_t bytes; - do { - bytes = write(fd, buf, len); - if (bytes >= 0) { - len -= bytes; - } else if (errno != EAGAIN && errno != EWOULDBLOCK && errno != EINTR) { - return -1; - } - } while (len > 0); - - return 0; -} - -static int http_open(const char *fn) -{ - char *p, *proxy, *q, *http_host, *host, *port, *path, *buf; - int fd, ret, l; - ssize_t bytes = 0, bufsz = 0x10000; - - /* parse URL; adapted from khttp_parse_url() in knetfile.c */ - if (strstr(fn, "http://") != fn) return 0; - // set ->http_host - for (p = (char*)fn + 7; *p && *p != '/'; ++p); - l = p - fn - 7; - http_host = calloc(l + 1, 1); - strncpy(http_host, fn + 7, l); - http_host[l] = 0; - for (q = http_host; *q && *q != ':'; ++q); - if (*q == ':') *q++ = 0; - // get http_proxy - proxy = getenv("http_proxy"); - // set host, port and path - if (proxy == 0) { - host = strdup(http_host); // when there is no proxy, server name is identical to http_host name. - port = strdup(*q? q : "80"); - path = strdup(*p? p : "/"); - } else { - host = (strstr(proxy, "http://") == proxy)? strdup(proxy + 7) : strdup(proxy); - for (q = host; *q && *q != ':'; ++q); - if (*q == ':') *q++ = 0; - port = strdup(*q? q : "80"); - path = strdup(fn); - } - - /* connect; adapted from khttp_connect() in knetfile.c */ - l = 0; - fd = socket_connect(host, port); - buf = calloc(bufsz, 1); // FIXME: I am lazy... But in principle, 64KB should be large enough. 
- l += snprintf(buf + l, bufsz, "GET %s HTTP/1.0\r\nHost: %s\r\n\r\n", - path, http_host); - if (write_bytes(fd, buf, l) != 0) { - close(fd); - fd = -1; - goto out; - } - l = 0; - retry: - while (l < bufsz && (bytes = read(fd, buf + l, 1)) > 0) { // read HTTP header; FIXME: bad efficiency - if (buf[l] == '\n' && l >= 3) - if (strncmp(buf + l - 3, "\r\n\r\n", 4) == 0) break; - ++l; - } - if (bytes < 0 && (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR)) goto retry; - - buf[l] = 0; - if (bytes < 0 || l < 14) { // prematured header - close(fd); - fd = -1; - goto out; - } - ret = strtol(buf + 8, &p, 0); // HTTP return code - if (ret != 200) { - close(fd); - fd = -1; - } - out: - free(buf); free(http_host); free(host); free(port); free(path); - return fd; -} - -typedef struct { - int max_response, ctrl_fd; - char *response; -} ftpaux_t; - -static int kftp_get_response(ftpaux_t *aux) -{ - unsigned char c; - int n = 0; - char *p; - if (socket_wait(aux->ctrl_fd, 1) <= 0) return 0; - while (read(aux->ctrl_fd, &c, 1)) { // FIXME: this is *VERY BAD* for unbuffered I/O - if (n >= aux->max_response) { - aux->max_response = aux->max_response? aux->max_response<<1 : 256; - aux->response = realloc(aux->response, aux->max_response); - } - aux->response[n++] = c; - if (c == '\n') { - if (n >= 4 && isdigit(aux->response[0]) && isdigit(aux->response[1]) && isdigit(aux->response[2]) - && aux->response[3] != '-') break; - n = 0; - continue; - } - } - if (n < 2) return -1; - aux->response[n-2] = 0; - return strtol(aux->response, &p, 0); -} - -static int kftp_send_cmd(ftpaux_t *aux, const char *cmd, int is_get) -{ - if (socket_wait(aux->ctrl_fd, 0) <= 0) return -1; // socket is not ready for writing - if (write_bytes(aux->ctrl_fd, cmd, strlen(cmd)) != 0) return -1; - return is_get? 
kftp_get_response(aux) : 0; -} - -static int ftp_open(const char *fn) -{ - char *p, *host = 0, *port = 0, *retr = 0; - char host2[80], port2[10]; - int v[6], l, fd = -1, ret, pasv_port, pasv_ip[4]; - ftpaux_t aux; - - /* parse URL */ - if (strstr(fn, "ftp://") != fn) return 0; - for (p = (char*)fn + 6; *p && *p != '/'; ++p); - if (*p != '/') return 0; - l = p - fn - 6; - port = strdup("21"); - host = calloc(l + 1, 1); - strncpy(host, fn + 6, l); - retr = calloc(strlen(p) + 8, 1); - sprintf(retr, "RETR %s\r\n", p); - - /* connect to ctrl */ - memset(&aux, 0, sizeof(ftpaux_t)); - aux.ctrl_fd = socket_connect(host, port); - if (aux.ctrl_fd == -1) goto ftp_open_end; /* fail to connect ctrl */ - - /* connect to the data stream */ - kftp_get_response(&aux); - kftp_send_cmd(&aux, "USER anonymous\r\n", 1); - kftp_send_cmd(&aux, "PASS kopen@\r\n", 1); - kftp_send_cmd(&aux, "TYPE I\r\n", 1); - kftp_send_cmd(&aux, "PASV\r\n", 1); - for (p = aux.response; *p && *p != '('; ++p); - if (*p != '(') goto ftp_open_end; - ++p; - sscanf(p, "%d,%d,%d,%d,%d,%d", &v[0], &v[1], &v[2], &v[3], &v[4], &v[5]); - memcpy(pasv_ip, v, 4 * sizeof(int)); - pasv_port = (v[4]<<8&0xff00) + v[5]; - kftp_send_cmd(&aux, retr, 0); - sprintf(host2, "%d.%d.%d.%d", pasv_ip[0], pasv_ip[1], pasv_ip[2], pasv_ip[3]); - sprintf(port2, "%d", pasv_port); - fd = socket_connect(host2, port2); - if (fd == -1) goto ftp_open_end; - ret = kftp_get_response(&aux); - if (ret != 150) { - close(fd); - fd = -1; - } - close(aux.ctrl_fd); - -ftp_open_end: - free(host); free(port); free(retr); free(aux.response); - return fd; -} -#endif /* !defined(_KO_NO_NET) */ - -static char **cmd2argv(const char *cmd) -{ - int i, beg, end, argc; - char **argv, *str; - end = strlen(cmd); - for (i = end - 1; i >= 0; --i) - if (!isspace(cmd[i])) break; - end = i + 1; - for (beg = 0; beg < end; ++beg) - if (!isspace(cmd[beg])) break; - if (beg == end) return 0; - for (i = beg + 1, argc = 0; i < end; ++i) - if (isspace(cmd[i]) && 
!isspace(cmd[i-1])) - ++argc; - argv = (char**)calloc(argc + 2, sizeof(void*)); - argv[0] = str = (char*)calloc(end - beg + 1, 1); - strncpy(argv[0], cmd + beg, end - beg); - for (i = argc = 1; i < end - beg; ++i) - if (isspace(str[i])) str[i] = 0; - else if (str[i] && str[i-1] == 0) argv[argc++] = &str[i]; - return argv; -} - -#define KO_STDIN 1 -#define KO_FILE 2 -#define KO_PIPE 3 -#define KO_HTTP 4 -#define KO_FTP 5 - -typedef struct { - int type, fd; - pid_t pid; -} koaux_t; - -void *kopen(const char *fn, int *_fd) -{ - koaux_t *aux = 0; - *_fd = -1; - if (strstr(fn, "http://") == fn) { - aux = calloc(1, sizeof(koaux_t)); - aux->type = KO_HTTP; - aux->fd = http_open(fn); - } else if (strstr(fn, "ftp://") == fn) { - aux = calloc(1, sizeof(koaux_t)); - aux->type = KO_FTP; - aux->fd = ftp_open(fn); - } else if (strcmp(fn, "-") == 0) { - aux = calloc(1, sizeof(koaux_t)); - aux->type = KO_STDIN; - aux->fd = STDIN_FILENO; - } else { - const char *p, *q; - for (p = fn; *p; ++p) - if (!isspace(*p)) break; - if (*p == '<') { // pipe open - int need_shell, pfd[2]; - pid_t pid; - // a simple check to see if we need to invoke a shell; not always working - for (q = p + 1; *q; ++q) - if (ispunct(*q) && *q != '.' 
&& *q != '_' && *q != '-' && *q != ':') - break; - need_shell = (*q != 0); - if (pipe(pfd) != 0) return 0; - pid = vfork(); - if (pid == -1) { /* vfork() error */ - close(pfd[0]); close(pfd[1]); - return 0; - } - if (pid == 0) { /* the child process */ - char **argv; /* FIXME: I do not know if this will lead to a memory leak */ - close(pfd[0]); - dup2(pfd[1], STDOUT_FILENO); - close(pfd[1]); - if (!need_shell) { - argv = cmd2argv(p + 1); - execvp(argv[0], argv); - free(argv[0]); free(argv); - } else execl("/bin/sh", "sh", "-c", p + 1, NULL); - exit(1); - } else { /* parent process */ - close(pfd[1]); - aux = calloc(1, sizeof(koaux_t)); - aux->type = KO_PIPE; - aux->fd = pfd[0]; - aux->pid = pid; - } - } else { -#ifdef _WIN32 - *_fd = open(fn, O_RDONLY | O_BINARY); -#else - *_fd = open(fn, O_RDONLY); -#endif - if (*_fd >= 0) { - aux = calloc(1, sizeof(koaux_t)); - aux->type = KO_FILE; - aux->fd = *_fd; - } - } - } - if (aux) *_fd = aux->fd; - return aux; -} - -int kclose(void *a) -{ - koaux_t *aux = (koaux_t*)a; - if (aux->type == KO_PIPE) { - int status; - pid_t pid; - pid = waitpid(aux->pid, &status, WNOHANG); - if (pid != aux->pid) kill(aux->pid, 15); - } - free(aux); - return 0; -} - -#ifdef _KO_MAIN -#define BUF_SIZE 0x10000 -int main(int argc, char *argv[]) -{ - void *x; - int l, fd; - unsigned char buf[BUF_SIZE]; - FILE *fp; - if (argc == 1) { - fprintf(stderr, "Usage: kopen \n"); - return 1; - } - x = kopen(argv[1], &fd); - fp = fdopen(fd, "r"); - if (fp == 0) { - fprintf(stderr, "ERROR: fail to open the input\n"); - return 1; - } - do { - if ((l = fread(buf, 1, BUF_SIZE, fp)) != 0) - fwrite(buf, 1, l, stdout); - } while (l == BUF_SIZE); - fclose(fp); - kclose(x); - return 0; -} -#endif diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/kseq.h --- a/bwa-0.7.9a/kseq.h Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,239 +0,0 @@ -/* The MIT License - - Copyright (c) 2008, 2009, 2011 Attractive Chaos - - Permission is hereby 
granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* Last Modified: 05MAR2012 */ - -#ifndef AC_KSEQ_H -#define AC_KSEQ_H - -#include -#include -#include - -#ifdef USE_MALLOC_WRAPPERS -# include "malloc_wrap.h" -#endif - -#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r -#define KS_SEP_TAB 1 // isspace() && !' 
' -#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows) -#define KS_SEP_MAX 2 - -#define __KS_TYPE(type_t) \ - typedef struct __kstream_t { \ - unsigned char *buf; \ - int begin, end, is_eof; \ - type_t f; \ - } kstream_t; - -#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end) -#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0) - -#define __KS_BASIC(type_t, __bufsize) \ - static inline kstream_t *ks_init(type_t f) \ - { \ - kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \ - ks->f = f; \ - ks->buf = (unsigned char*)malloc(__bufsize); \ - return ks; \ - } \ - static inline void ks_destroy(kstream_t *ks) \ - { \ - if (ks) { \ - free(ks->buf); \ - free(ks); \ - } \ - } - -#define __KS_GETC(__read, __bufsize) \ - static inline int ks_getc(kstream_t *ks) \ - { \ - if (ks->is_eof && ks->begin >= ks->end) return -1; \ - if (ks->begin >= ks->end) { \ - ks->begin = 0; \ - ks->end = __read(ks->f, ks->buf, __bufsize); \ - if (ks->end < __bufsize) ks->is_eof = 1; \ - if (ks->end == 0) return -1; \ - } \ - return (int)ks->buf[ks->begin++]; \ - } - -#ifndef KSTRING_T -#define KSTRING_T kstring_t -typedef struct __kstring_t { - size_t l, m; - char *s; -} kstring_t; -#endif - -#ifndef kroundup32 -#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) -#endif - -#define __KS_GETUNTIL(__read, __bufsize) \ - static int ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \ - { \ - if (dret) *dret = 0; \ - str->l = append? 
str->l : 0; \ - if (ks->begin >= ks->end && ks->is_eof) return -1; \ - for (;;) { \ - int i; \ - if (ks->begin >= ks->end) { \ - if (!ks->is_eof) { \ - ks->begin = 0; \ - ks->end = __read(ks->f, ks->buf, __bufsize); \ - if (ks->end < __bufsize) ks->is_eof = 1; \ - if (ks->end == 0) break; \ - } else break; \ - } \ - if (delimiter == KS_SEP_LINE) { \ - for (i = ks->begin; i < ks->end; ++i) \ - if (ks->buf[i] == '\n') break; \ - } else if (delimiter > KS_SEP_MAX) { \ - for (i = ks->begin; i < ks->end; ++i) \ - if (ks->buf[i] == delimiter) break; \ - } else if (delimiter == KS_SEP_SPACE) { \ - for (i = ks->begin; i < ks->end; ++i) \ - if (isspace(ks->buf[i])) break; \ - } else if (delimiter == KS_SEP_TAB) { \ - for (i = ks->begin; i < ks->end; ++i) \ - if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \ - } else i = 0; /* never come to here! */ \ - if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \ - str->m = str->l + (i - ks->begin) + 1; \ - kroundup32(str->m); \ - str->s = (char*)realloc(str->s, str->m); \ - } \ - memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \ - str->l = str->l + (i - ks->begin); \ - ks->begin = i + 1; \ - if (i < ks->end) { \ - if (dret) *dret = ks->buf[i]; \ - break; \ - } \ - } \ - if (str->s == 0) { \ - str->m = 1; \ - str->s = (char*)calloc(1, 1); \ - } else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \ - str->s[str->l] = '\0'; \ - return str->l; \ - } \ - static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \ - { return ks_getuntil2(ks, delimiter, str, dret, 0); } - -#define KSTREAM_INIT(type_t, __read, __bufsize) \ - __KS_TYPE(type_t) \ - __KS_BASIC(type_t, __bufsize) \ - __KS_GETC(__read, __bufsize) \ - __KS_GETUNTIL(__read, __bufsize) - -#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0) - -#define __KSEQ_BASIC(SCOPE, type_t) \ - SCOPE kseq_t *kseq_init(type_t fd) \ - { \ - kseq_t *s = 
(kseq_t*)calloc(1, sizeof(kseq_t)); \ - s->f = ks_init(fd); \ - return s; \ - } \ - SCOPE void kseq_destroy(kseq_t *ks) \ - { \ - if (!ks) return; \ - free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \ - ks_destroy(ks->f); \ - free(ks); \ - } - -/* Return value: - >=0 length of the sequence (normal) - -1 end-of-file - -2 truncated quality string - */ -#define __KSEQ_READ(SCOPE) \ - SCOPE int kseq_read(kseq_t *seq) \ - { \ - int c; \ - kstream_t *ks = seq->f; \ - if (seq->last_char == 0) { /* then jump to the next header line */ \ - while ((c = ks_getc(ks)) != -1 && c != '>' && c != '@'); \ - if (c == -1) return -1; /* end of file */ \ - seq->last_char = c; \ - } /* else: the first header char has been read in the previous call */ \ - seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \ - if (ks_getuntil(ks, 0, &seq->name, &c) < 0) return -1; /* normal exit: EOF */ \ - if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \ - if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \ - seq->seq.m = 256; \ - seq->seq.s = (char*)malloc(seq->seq.m); \ - } \ - while ((c = ks_getc(ks)) != -1 && c != '>' && c != '+' && c != '@') { \ - if (c == '\n') continue; /* skip empty lines */ \ - seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \ - ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \ - } \ - if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \ - if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \ - seq->seq.m = seq->seq.l + 2; \ - kroundup32(seq->seq.m); /* rounded to the next closest 2^k */ \ - seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \ - } \ - seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \ - if (c != '+') return seq->seq.l; /* FASTA */ \ - if (seq->qual.m < seq->seq.m) { /* allocate memory 
for qual in case insufficient */ \ - seq->qual.m = seq->seq.m; \ - seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \ - } \ - while ((c = ks_getc(ks)) != -1 && c != '\n'); /* skip the rest of '+' line */ \ - if (c == -1) return -2; /* error: no quality string */ \ - while (ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l); \ - seq->last_char = 0; /* we have not come to the next header line */ \ - if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \ - return seq->seq.l; \ - } - -#define __KSEQ_TYPE(type_t) \ - typedef struct { \ - kstring_t name, comment, seq, qual; \ - int last_char; \ - kstream_t *f; \ - } kseq_t; - -#define KSEQ_INIT2(SCOPE, type_t, __read) \ - KSTREAM_INIT(type_t, __read, 16384) \ - __KSEQ_TYPE(type_t) \ - __KSEQ_BASIC(SCOPE, type_t) \ - __KSEQ_READ(SCOPE) - -#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read) - -#define KSEQ_DECLARE(type_t) \ - __KS_TYPE(type_t) \ - __KSEQ_TYPE(type_t) \ - extern kseq_t *kseq_init(type_t fd); \ - void kseq_destroy(kseq_t *ks); \ - int kseq_read(kseq_t *seq); - -#endif diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/ksort.h --- a/bwa-0.7.9a/ksort.h Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,273 +0,0 @@ -/* The MIT License - - Copyright (c) 2008, by Attractive Chaos - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. 
- - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* - 2008-11-16 (0.1.4): - - * Fixed a bug in introsort() that happens in rare cases. - - 2008-11-05 (0.1.3): - - * Fixed a bug in introsort() for complex comparisons. - - * Fixed a bug in mergesort(). The previous version is not stable. - - 2008-09-15 (0.1.2): - - * Accelerated introsort. On my Mac (not on another Linux machine), - my implementation is as fast as std::sort on random input. - - * Added combsort and in introsort, switch to combsort if the - recursion is too deep. - - 2008-09-13 (0.1.1): - - * Added k-small algorithm - - 2008-09-05 (0.1.0): - - * Initial version - -*/ - -#ifndef AC_KSORT_H -#define AC_KSORT_H - -#include -#include - -#ifdef USE_MALLOC_WRAPPERS -# include "malloc_wrap.h" -#endif - -typedef struct { - void *left, *right; - int depth; -} ks_isort_stack_t; - -#define KSORT_SWAP(type_t, a, b) { register type_t t=(a); (a)=(b); (b)=t; } - -#define KSORT_INIT(name, type_t, __sort_lt) \ - void ks_mergesort_##name(size_t n, type_t array[], type_t temp[]) \ - { \ - type_t *a2[2], *a, *b; \ - int curr, shift; \ - \ - a2[0] = array; \ - a2[1] = temp? 
temp : (type_t*)malloc(sizeof(type_t) * n); \ - for (curr = 0, shift = 0; (1ul<> 1) - 1; i != (size_t)(-1); --i) \ - ks_heapadjust_##name(i, lsize, l); \ - } \ - void ks_heapsort_##name(size_t lsize, type_t l[]) \ - { \ - size_t i; \ - for (i = lsize - 1; i > 0; --i) { \ - type_t tmp; \ - tmp = *l; *l = l[i]; l[i] = tmp; ks_heapadjust_##name(0, i, l); \ - } \ - } \ - static inline void __ks_insertsort_##name(type_t *s, type_t *t) \ - { \ - type_t *i, *j, swap_tmp; \ - for (i = s + 1; i < t; ++i) \ - for (j = i; j > s && __sort_lt(*j, *(j-1)); --j) { \ - swap_tmp = *j; *j = *(j-1); *(j-1) = swap_tmp; \ - } \ - } \ - void ks_combsort_##name(size_t n, type_t a[]) \ - { \ - const double shrink_factor = 1.2473309501039786540366528676643; \ - int do_swap; \ - size_t gap = n; \ - type_t tmp, *i, *j; \ - do { \ - if (gap > 2) { \ - gap = (size_t)(gap / shrink_factor); \ - if (gap == 9 || gap == 10) gap = 11; \ - } \ - do_swap = 0; \ - for (i = a; i < a + n - gap; ++i) { \ - j = i + gap; \ - if (__sort_lt(*j, *i)) { \ - tmp = *i; *i = *j; *j = tmp; \ - do_swap = 1; \ - } \ - } \ - } while (do_swap || gap > 2); \ - if (gap != 1) __ks_insertsort_##name(a, a + n); \ - } \ - void ks_introsort_##name(size_t n, type_t a[]) \ - { \ - int d; \ - ks_isort_stack_t *top, *stack; \ - type_t rp, swap_tmp; \ - type_t *s, *t, *i, *j, *k; \ - \ - if (n < 1) return; \ - else if (n == 2) { \ - if (__sort_lt(a[1], a[0])) { swap_tmp = a[0]; a[0] = a[1]; a[1] = swap_tmp; } \ - return; \ - } \ - for (d = 2; 1ul<>1) + 1; \ - if (__sort_lt(*k, *i)) { \ - if (__sort_lt(*k, *j)) k = j; \ - } else k = __sort_lt(*j, *i)? 
i : j; \ - rp = *k; \ - if (k != t) { swap_tmp = *k; *k = *t; *t = swap_tmp; } \ - for (;;) { \ - do ++i; while (__sort_lt(*i, rp)); \ - do --j; while (i <= j && __sort_lt(rp, *j)); \ - if (j <= i) break; \ - swap_tmp = *i; *i = *j; *j = swap_tmp; \ - } \ - swap_tmp = *i; *i = *t; *t = swap_tmp; \ - if (i-s > t-i) { \ - if (i-s > 16) { top->left = s; top->right = i-1; top->depth = d; ++top; } \ - s = t-i > 16? i+1 : t; \ - } else { \ - if (t-i > 16) { top->left = i+1; top->right = t; top->depth = d; ++top; } \ - t = i-s > 16? i-1 : s; \ - } \ - } else { \ - if (top == stack) { \ - free(stack); \ - __ks_insertsort_##name(a, a+n); \ - return; \ - } else { --top; s = (type_t*)top->left; t = (type_t*)top->right; d = top->depth; } \ - } \ - } \ - } \ - /* This function is adapted from: http://ndevilla.free.fr/median/ */ \ - /* 0 <= kk < n */ \ - type_t ks_ksmall_##name(size_t n, type_t arr[], size_t kk) \ - { \ - type_t *low, *high, *k, *ll, *hh, *mid; \ - low = arr; high = arr + n - 1; k = arr + kk; \ - for (;;) { \ - if (high <= low) return *k; \ - if (high == low + 1) { \ - if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ - return *k; \ - } \ - mid = low + (high - low) / 2; \ - if (__sort_lt(*high, *mid)) KSORT_SWAP(type_t, *mid, *high); \ - if (__sort_lt(*high, *low)) KSORT_SWAP(type_t, *low, *high); \ - if (__sort_lt(*low, *mid)) KSORT_SWAP(type_t, *mid, *low); \ - KSORT_SWAP(type_t, *mid, *(low+1)); \ - ll = low + 1; hh = high; \ - for (;;) { \ - do ++ll; while (__sort_lt(*ll, *low)); \ - do --hh; while (__sort_lt(*low, *hh)); \ - if (hh < ll) break; \ - KSORT_SWAP(type_t, *ll, *hh); \ - } \ - KSORT_SWAP(type_t, *low, *hh); \ - if (hh <= k) low = ll; \ - if (hh >= k) high = hh - 1; \ - } \ - } - -#define ks_mergesort(name, n, a, t) ks_mergesort_##name(n, a, t) -#define ks_introsort(name, n, a) ks_introsort_##name(n, a) -#define ks_combsort(name, n, a) ks_combsort_##name(n, a) -#define ks_heapsort(name, n, a) ks_heapsort_##name(n, a) -#define 
ks_heapmake(name, n, a) ks_heapmake_##name(n, a) -#define ks_heapadjust(name, i, n, a) ks_heapadjust_##name(i, n, a) -#define ks_ksmall(name, n, a, k) ks_ksmall_##name(n, a, k) - -#define ks_lt_generic(a, b) ((a) < (b)) -#define ks_lt_str(a, b) (strcmp((a), (b)) < 0) - -typedef const char *ksstr_t; - -#define KSORT_INIT_GENERIC(type_t) KSORT_INIT(type_t, type_t, ks_lt_generic) -#define KSORT_INIT_STR KSORT_INIT(str, ksstr_t, ks_lt_str) - -#endif diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/kstring.c --- a/bwa-0.7.9a/kstring.c Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,39 +0,0 @@ -#include -#include -#include "kstring.h" - -#ifdef USE_MALLOC_WRAPPERS -# include "malloc_wrap.h" -#endif - -int ksprintf(kstring_t *s, const char *fmt, ...) -{ - va_list ap; - int l; - va_start(ap, fmt); - l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); - va_end(ap); - if (l + 1 > s->m - s->l) { - s->m = s->l + l + 2; - kroundup32(s->m); - s->s = (char*)realloc(s->s, s->m); - va_start(ap, fmt); - l = vsnprintf(s->s + s->l, s->m - s->l, fmt, ap); - } - va_end(ap); - s->l += l; - return l; -} - -#ifdef KSTRING_MAIN -#include -int main() -{ - kstring_t *s; - s = (kstring_t*)calloc(1, sizeof(kstring_t)); - ksprintf(s, "abcdefg: %d", 100); - printf("%s\n", s->s); - free(s); - return 0; -} -#endif diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/kstring.h --- a/bwa-0.7.9a/kstring.h Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,115 +0,0 @@ -#ifndef KSTRING_H -#define KSTRING_H - -#include -#include - -#ifdef USE_MALLOC_WRAPPERS -# include "malloc_wrap.h" -#endif - -#ifndef kroundup32 -#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) -#endif - -#ifndef KSTRING_T -#define KSTRING_T kstring_t -typedef struct __kstring_t { - size_t l, m; - char *s; -} kstring_t; -#endif - -static inline void ks_resize(kstring_t *s, size_t size) -{ - if (s->m < size) { - s->m = size; - 
kroundup32(s->m); - s->s = (char*)realloc(s->s, s->m); - } -} - -static inline int kputsn(const char *p, int l, kstring_t *s) -{ - if (s->l + l + 1 >= s->m) { - s->m = s->l + l + 2; - kroundup32(s->m); - s->s = (char*)realloc(s->s, s->m); - } - memcpy(s->s + s->l, p, l); - s->l += l; - s->s[s->l] = 0; - return l; -} - -static inline int kputs(const char *p, kstring_t *s) -{ - return kputsn(p, strlen(p), s); -} - -static inline int kputc(int c, kstring_t *s) -{ - if (s->l + 1 >= s->m) { - s->m = s->l + 2; - kroundup32(s->m); - s->s = (char*)realloc(s->s, s->m); - } - s->s[s->l++] = c; - s->s[s->l] = 0; - return c; -} - -static inline int kputw(int c, kstring_t *s) -{ - char buf[16]; - int l, x; - if (c == 0) return kputc('0', s); - for (l = 0, x = c < 0? -c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; - if (c < 0) buf[l++] = '-'; - if (s->l + l + 1 >= s->m) { - s->m = s->l + l + 2; - kroundup32(s->m); - s->s = (char*)realloc(s->s, s->m); - } - for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x]; - s->s[s->l] = 0; - return 0; -} - -static inline int kputuw(unsigned c, kstring_t *s) -{ - char buf[16]; - int l, i; - unsigned x; - if (c == 0) return kputc('0', s); - for (l = 0, x = c; x > 0; x /= 10) buf[l++] = x%10 + '0'; - if (s->l + l + 1 >= s->m) { - s->m = s->l + l + 2; - kroundup32(s->m); - s->s = (char*)realloc(s->s, s->m); - } - for (i = l - 1; i >= 0; --i) s->s[s->l++] = buf[i]; - s->s[s->l] = 0; - return 0; -} - -static inline int kputl(long c, kstring_t *s) -{ - char buf[32]; - long l, x; - if (c == 0) return kputc('0', s); - for (l = 0, x = c < 0? 
-c : c; x > 0; x /= 10) buf[l++] = x%10 + '0'; - if (c < 0) buf[l++] = '-'; - if (s->l + l + 1 >= s->m) { - s->m = s->l + l + 2; - kroundup32(s->m); - s->s = (char*)realloc(s->s, s->m); - } - for (x = l - 1; x >= 0; --x) s->s[s->l++] = buf[x]; - s->s[s->l] = 0; - return 0; -} - -int ksprintf(kstring_t *s, const char *fmt, ...); - -#endif diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/ksw.c --- a/bwa-0.7.9a/ksw.c Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,713 +0,0 @@ -/* The MIT License - - Copyright (c) 2011 by Attractive Chaos - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. 
-*/ - -#include -#include -#include -#include -#include "ksw.h" - -#ifdef USE_MALLOC_WRAPPERS -# include "malloc_wrap.h" -#endif - -#ifdef __GNUC__ -#define LIKELY(x) __builtin_expect((x),1) -#define UNLIKELY(x) __builtin_expect((x),0) -#else -#define LIKELY(x) (x) -#define UNLIKELY(x) (x) -#endif - -const kswr_t g_defr = { 0, -1, -1, -1, -1, -1, -1 }; - -struct _kswq_t { - int qlen, slen; - uint8_t shift, mdiff, max, size; - __m128i *qp, *H0, *H1, *E, *Hmax; -}; - -/** - * Initialize the query data structure - * - * @param size Number of bytes used to store a score; valid valures are 1 or 2 - * @param qlen Length of the query sequence - * @param query Query sequence - * @param m Size of the alphabet - * @param mat Scoring matrix in a one-dimension array - * - * @return Query data structure - */ -kswq_t *ksw_qinit(int size, int qlen, const uint8_t *query, int m, const int8_t *mat) -{ - kswq_t *q; - int slen, a, tmp, p; - - size = size > 1? 2 : 1; - p = 8 * (3 - size); // # values per __m128i - slen = (qlen + p - 1) / p; // segmented length - q = (kswq_t*)malloc(sizeof(kswq_t) + 256 + 16 * slen * (m + 4)); // a single block of memory - q->qp = (__m128i*)(((size_t)q + sizeof(kswq_t) + 15) >> 4 << 4); // align memory - q->H0 = q->qp + slen * m; - q->H1 = q->H0 + slen; - q->E = q->H1 + slen; - q->Hmax = q->E + slen; - q->slen = slen; q->qlen = qlen; q->size = size; - // compute shift - tmp = m * m; - for (a = 0, q->shift = 127, q->mdiff = 0; a < tmp; ++a) { // find the minimum and maximum score - if (mat[a] < (int8_t)q->shift) q->shift = mat[a]; - if (mat[a] > (int8_t)q->mdiff) q->mdiff = mat[a]; - } - q->max = q->mdiff; - q->shift = 256 - q->shift; // NB: q->shift is uint8_t - q->mdiff += q->shift; // this is the difference between the min and max scores - // An example: p=8, qlen=19, slen=3 and segmentation: - // {{0,3,6,9,12,15,18,-1},{1,4,7,10,13,16,-1,-1},{2,5,8,11,14,17,-1,-1}} - if (size == 1) { - int8_t *t = (int8_t*)q->qp; - for (a = 0; a < m; ++a) { - int i, 
k, nlen = slen * p; - const int8_t *ma = mat + a * m; - for (i = 0; i < slen; ++i) - for (k = i; k < nlen; k += slen) // p iterations - *t++ = (k >= qlen? 0 : ma[query[k]]) + q->shift; - } - } else { - int16_t *t = (int16_t*)q->qp; - for (a = 0; a < m; ++a) { - int i, k, nlen = slen * p; - const int8_t *ma = mat + a * m; - for (i = 0; i < slen; ++i) - for (k = i; k < nlen; k += slen) // p iterations - *t++ = (k >= qlen? 0 : ma[query[k]]); - } - } - return q; -} - -kswr_t ksw_u8(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra) // the first gap costs -(_o+_e) -{ - int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc; - uint64_t *b; - __m128i zero, oe_del, e_del, oe_ins, e_ins, shift, *H0, *H1, *E, *Hmax; - kswr_t r; - -#define __max_16(ret, xx) do { \ - (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 8)); \ - (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 4)); \ - (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 2)); \ - (xx) = _mm_max_epu8((xx), _mm_srli_si128((xx), 1)); \ - (ret) = _mm_extract_epi16((xx), 0) & 0x00ff; \ - } while (0) - - // initialization - r = g_defr; - minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000; - endsc = (xtra&KSW_XSTOP)? 
xtra&0xffff : 0x10000; - m_b = n_b = 0; b = 0; - zero = _mm_set1_epi32(0); - oe_del = _mm_set1_epi8(_o_del + _e_del); - e_del = _mm_set1_epi8(_e_del); - oe_ins = _mm_set1_epi8(_o_ins + _e_ins); - e_ins = _mm_set1_epi8(_e_ins); - shift = _mm_set1_epi8(q->shift); - H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; - slen = q->slen; - for (i = 0; i < slen; ++i) { - _mm_store_si128(E + i, zero); - _mm_store_si128(H0 + i, zero); - _mm_store_si128(Hmax + i, zero); - } - // the core loop - for (i = 0; i < tlen; ++i) { - int j, k, cmp, imax; - __m128i e, h, t, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector - h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example - h = _mm_slli_si128(h, 1); // h=H(i-1,-1); << instead of >> because x64 is little-endian - for (j = 0; LIKELY(j < slen); ++j) { - /* SW cells are computed in the following order: - * H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)} - * E(i+1,j) = max{H(i,j)-q, E(i,j)-r} - * F(i,j+1) = max{H(i,j)-q, F(i,j)-r} - */ - // compute H'(i,j); note that at the beginning, h=H'(i-1,j-1) - h = _mm_adds_epu8(h, _mm_load_si128(S + j)); - h = _mm_subs_epu8(h, shift); // h=H'(i-1,j-1)+S(i,j) - e = _mm_load_si128(E + j); // e=E'(i,j) - h = _mm_max_epu8(h, e); - h = _mm_max_epu8(h, f); // h=H'(i,j) - max = _mm_max_epu8(max, h); // set max - _mm_store_si128(H1 + j, h); // save to H'(i,j) - // now compute E'(i+1,j) - e = _mm_subs_epu8(e, e_del); // e=E'(i,j) - e_del - t = _mm_subs_epu8(h, oe_del); // h=H'(i,j) - o_del - e_del - e = _mm_max_epu8(e, t); // e=E'(i+1,j) - _mm_store_si128(E + j, e); // save to E'(i+1,j) - // now compute F'(i,j+1) - f = _mm_subs_epu8(f, e_ins); - t = _mm_subs_epu8(h, oe_ins); // h=H'(i,j) - o_ins - e_ins - f = _mm_max_epu8(f, t); - // get H'(i-1,j) and prepare for the next j - h = _mm_load_si128(H0 + j); // h=H'(i-1,j) - } - // NB: we do not need to set E(i,j) as we disallow adjecent insertion and then deletion - for (k = 0; LIKELY(k < 16); 
++k) { // this block mimics SWPS3; NB: H(i,j) updated in the lazy-F loop cannot exceed max - f = _mm_slli_si128(f, 1); - for (j = 0; LIKELY(j < slen); ++j) { - h = _mm_load_si128(H1 + j); - h = _mm_max_epu8(h, f); // h=H'(i,j) - _mm_store_si128(H1 + j, h); - h = _mm_subs_epu8(h, oe_ins); - f = _mm_subs_epu8(f, e_ins); - cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(_mm_subs_epu8(f, h), zero)); - if (UNLIKELY(cmp == 0xffff)) goto end_loop16; - } - } -end_loop16: - //int k;for (k=0;k<16;++k)printf("%d ", ((uint8_t*)&max)[k]);printf("\n"); - __max_16(imax, max); // imax is the maximum number in max - if (imax >= minsc) { // write the b array; this condition adds branching unfornately - if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { // then append - if (n_b == m_b) { - m_b = m_b? m_b<<1 : 8; - b = (uint64_t*)realloc(b, 8 * m_b); - } - b[n_b++] = (uint64_t)imax<<32 | i; - } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last - } - if (imax > gmax) { - gmax = imax; te = i; // te is the end position on the target - for (j = 0; LIKELY(j < slen); ++j) // keep the H1 vector - _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j)); - if (gmax + q->shift >= 255 || gmax >= endsc) break; - } - S = H1; H1 = H0; H0 = S; // swap H0 and H1 - } - r.score = gmax + q->shift < 255? 
gmax : 255; - r.te = te; - if (r.score != 255) { // get a->qe, the end of query match; find the 2nd best score - int max = -1, tmp, low, high, qlen = slen * 16; - uint8_t *t = (uint8_t*)Hmax; - for (i = 0; i < qlen; ++i, ++t) - if ((int)*t > max) max = *t, r.qe = i / 16 + i % 16 * slen; - else if ((int)*t == max && (tmp = i / 16 + i % 16 * slen) < r.qe) r.qe = tmp; - //printf("%d,%d\n", max, gmax); - if (b) { - i = (r.score + q->max - 1) / q->max; - low = te - i; high = te + i; - for (i = 0; i < n_b; ++i) { - int e = (int32_t)b[i]; - if ((e < low || e > high) && (int)(b[i]>>32) > r.score2) - r.score2 = b[i]>>32, r.te2 = e; - } - } - } - free(b); - return r; -} - -kswr_t ksw_i16(kswq_t *q, int tlen, const uint8_t *target, int _o_del, int _e_del, int _o_ins, int _e_ins, int xtra) // the first gap costs -(_o+_e) -{ - int slen, i, m_b, n_b, te = -1, gmax = 0, minsc, endsc; - uint64_t *b; - __m128i zero, oe_del, e_del, oe_ins, e_ins, *H0, *H1, *E, *Hmax; - kswr_t r; - -#define __max_8(ret, xx) do { \ - (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 8)); \ - (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 4)); \ - (xx) = _mm_max_epi16((xx), _mm_srli_si128((xx), 2)); \ - (ret) = _mm_extract_epi16((xx), 0); \ - } while (0) - - // initialization - r = g_defr; - minsc = (xtra&KSW_XSUBO)? xtra&0xffff : 0x10000; - endsc = (xtra&KSW_XSTOP)? 
xtra&0xffff : 0x10000; - m_b = n_b = 0; b = 0; - zero = _mm_set1_epi32(0); - oe_del = _mm_set1_epi16(_o_del + _e_del); - e_del = _mm_set1_epi16(_e_del); - oe_ins = _mm_set1_epi16(_o_ins + _e_ins); - e_ins = _mm_set1_epi16(_e_ins); - H0 = q->H0; H1 = q->H1; E = q->E; Hmax = q->Hmax; - slen = q->slen; - for (i = 0; i < slen; ++i) { - _mm_store_si128(E + i, zero); - _mm_store_si128(H0 + i, zero); - _mm_store_si128(Hmax + i, zero); - } - // the core loop - for (i = 0; i < tlen; ++i) { - int j, k, imax; - __m128i e, t, h, f = zero, max = zero, *S = q->qp + target[i] * slen; // s is the 1st score vector - h = _mm_load_si128(H0 + slen - 1); // h={2,5,8,11,14,17,-1,-1} in the above example - h = _mm_slli_si128(h, 2); - for (j = 0; LIKELY(j < slen); ++j) { - h = _mm_adds_epi16(h, *S++); - e = _mm_load_si128(E + j); - h = _mm_max_epi16(h, e); - h = _mm_max_epi16(h, f); - max = _mm_max_epi16(max, h); - _mm_store_si128(H1 + j, h); - e = _mm_subs_epu16(e, e_del); - t = _mm_subs_epu16(h, oe_del); - e = _mm_max_epi16(e, t); - _mm_store_si128(E + j, e); - f = _mm_subs_epu16(f, e_ins); - t = _mm_subs_epu16(h, oe_ins); - f = _mm_max_epi16(f, t); - h = _mm_load_si128(H0 + j); - } - for (k = 0; LIKELY(k < 16); ++k) { - f = _mm_slli_si128(f, 2); - for (j = 0; LIKELY(j < slen); ++j) { - h = _mm_load_si128(H1 + j); - h = _mm_max_epi16(h, f); - _mm_store_si128(H1 + j, h); - h = _mm_subs_epu16(h, oe_ins); - f = _mm_subs_epu16(f, e_ins); - if(UNLIKELY(!_mm_movemask_epi8(_mm_cmpgt_epi16(f, h)))) goto end_loop8; - } - } -end_loop8: - __max_8(imax, max); - if (imax >= minsc) { - if (n_b == 0 || (int32_t)b[n_b-1] + 1 != i) { - if (n_b == m_b) { - m_b = m_b? 
m_b<<1 : 8; - b = (uint64_t*)realloc(b, 8 * m_b); - } - b[n_b++] = (uint64_t)imax<<32 | i; - } else if ((int)(b[n_b-1]>>32) < imax) b[n_b-1] = (uint64_t)imax<<32 | i; // modify the last - } - if (imax > gmax) { - gmax = imax; te = i; - for (j = 0; LIKELY(j < slen); ++j) - _mm_store_si128(Hmax + j, _mm_load_si128(H1 + j)); - if (gmax >= endsc) break; - } - S = H1; H1 = H0; H0 = S; - } - r.score = gmax; r.te = te; - { - int max = -1, tmp, low, high, qlen = slen * 8; - uint16_t *t = (uint16_t*)Hmax; - for (i = 0, r.qe = -1; i < qlen; ++i, ++t) - if ((int)*t > max) max = *t, r.qe = i / 8 + i % 8 * slen; - else if ((int)*t == max && (tmp = i / 8 + i % 8 * slen) < r.qe) r.qe = tmp; - if (b) { - i = (r.score + q->max - 1) / q->max; - low = te - i; high = te + i; - for (i = 0; i < n_b; ++i) { - int e = (int32_t)b[i]; - if ((e < low || e > high) && (int)(b[i]>>32) > r.score2) - r.score2 = b[i]>>32, r.te2 = e; - } - } - } - free(b); - return r; -} - -static inline void revseq(int l, uint8_t *s) -{ - int i, t; - for (i = 0; i < l>>1; ++i) - t = s[i], s[i] = s[l - 1 - i], s[l - 1 - i] = t; -} - -kswr_t ksw_align2(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int xtra, kswq_t **qry) -{ - int size; - kswq_t *q; - kswr_t r, rr; - kswr_t (*func)(kswq_t*, int, const uint8_t*, int, int, int, int, int); - - q = (qry && *qry)? *qry : ksw_qinit((xtra&KSW_XBYTE)? 1 : 2, qlen, query, m, mat); - if (qry && *qry == 0) *qry = q; - func = q->size == 2? 
ksw_i16 : ksw_u8; - size = q->size; - r = func(q, tlen, target, o_del, e_del, o_ins, e_ins, xtra); - if (qry == 0) free(q); - if ((xtra&KSW_XSTART) == 0 || ((xtra&KSW_XSUBO) && r.score < (xtra&0xffff))) return r; - revseq(r.qe + 1, query); revseq(r.te + 1, target); // +1 because qe/te points to the exact end, not the position after the end - q = ksw_qinit(size, r.qe + 1, query, m, mat); - rr = func(q, tlen, target, o_del, e_del, o_ins, e_ins, KSW_XSTOP | r.score); - revseq(r.qe + 1, query); revseq(r.te + 1, target); - free(q); - if (r.score == rr.score) - r.tb = r.te - rr.te, r.qb = r.qe - rr.qe; - return r; -} - -kswr_t ksw_align(int qlen, uint8_t *query, int tlen, uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int xtra, kswq_t **qry) -{ - return ksw_align2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, xtra, qry); -} - -/******************** - *** SW extension *** - ********************/ - -typedef struct { - int32_t h, e; -} eh_t; - -int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *_qle, int *_tle, int *_gtle, int *_gscore, int *_max_off) -{ - eh_t *eh; // score array - int8_t *qp; // query profile - int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, beg, end, max, max_i, max_j, max_ins, max_del, max_ie, gscore, max_off; - assert(h0 > 0); - // allocate memory - qp = malloc(qlen * m); - eh = calloc(qlen + 1, 8); - // generate the query profile - for (k = i = 0; k < m; ++k) { - const int8_t *p = &mat[k * m]; - for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]]; - } - // fill the first row - eh[0].h = h0; eh[1].h = h0 > oe_ins? h0 - oe_ins : 0; - for (j = 2; j <= qlen && eh[j-1].h > e_ins; ++j) - eh[j].h = eh[j-1].h - e_ins; - // adjust $w if it is too large - k = m * m; - for (i = 0, max = 0; i < k; ++i) // get the max score - max = max > mat[i]? 
max : mat[i]; - max_ins = (int)((double)(qlen * max + end_bonus - o_ins) / e_ins + 1.); - max_ins = max_ins > 1? max_ins : 1; - w = w < max_ins? w : max_ins; - max_del = (int)((double)(qlen * max + end_bonus - o_del) / e_del + 1.); - max_del = max_del > 1? max_del : 1; - w = w < max_del? w : max_del; // TODO: is this necessary? - // DP loop - max = h0, max_i = max_j = -1; max_ie = -1, gscore = -1; - max_off = 0; - beg = 0, end = qlen; - for (i = 0; LIKELY(i < tlen); ++i) { - int t, f = 0, h1, m = 0, mj = -1; - int8_t *q = &qp[target[i] * qlen]; - // apply the band and the constraint (if provided) - if (beg < i - w) beg = i - w; - if (end > i + w + 1) end = i + w + 1; - if (end > qlen) end = qlen; - // compute the first column - if (beg == 0) { - h1 = h0 - (o_del + e_del * (i + 1)); - if (h1 < 0) h1 = 0; - } else h1 = 0; - for (j = beg; LIKELY(j < end); ++j) { - // At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1) - // Similar to SSE2-SW, cells are computed in the following order: - // H(i,j) = max{H(i-1,j-1)+S(i,j), E(i,j), F(i,j)} - // E(i+1,j) = max{H(i,j)-gapo, E(i,j)} - gape - // F(i,j+1) = max{H(i,j)-gapo, F(i,j)} - gape - eh_t *p = &eh[j]; - int h, M = p->h, e = p->e; // get H(i-1,j-1) and E(i-1,j) - p->h = h1; // set H(i,j-1) for the next row - M = M? M + q[j] : 0;// separating H and M to disallow a cigar like "100M3I3D20M" - h = M > e? M : e; // e and f are guaranteed to be non-negative, so h>=0 even if M<0 - h = h > f? h : f; - h1 = h; // save H(i,j) to h1 for the next column - mj = m > h? mj : j; // record the position where max score is achieved - m = m > h? m : h; // m is stored at eh[mj+1] - t = M - oe_del; - t = t > 0? t : 0; - e -= e_del; - e = e > t? e : t; // computed E(i+1,j) - p->e = e; // save E(i+1,j) for the next row - t = M - oe_ins; - t = t > 0? t : 0; - f -= e_ins; - f = f > t? f : t; // computed F(i,j+1) - } - eh[end].h = h1; eh[end].e = 0; - if (j == qlen) { - max_ie = gscore > h1? 
max_ie : i; - gscore = gscore > h1? gscore : h1; - } - if (m == 0) break; - if (m > max) { - max = m, max_i = i, max_j = mj; - max_off = max_off > abs(mj - i)? max_off : abs(mj - i); - } else if (zdrop > 0) { - if (i - max_i > mj - max_j) { - if (max - m - ((i - max_i) - (mj - max_j)) * e_del > zdrop) break; - } else { - if (max - m - ((mj - max_j) - (i - max_i)) * e_ins > zdrop) break; - } - } - // update beg and end for the next round - for (j = beg; LIKELY(j < end) && eh[j].h == 0 && eh[j].e == 0; ++j); - beg = j; - for (j = end; LIKELY(j >= beg) && eh[j].h == 0 && eh[j].e == 0; --j); - end = j + 2 < qlen? j + 2 : qlen; - //beg = 0; end = qlen; // uncomment this line for debugging - } - free(eh); free(qp); - if (_qle) *_qle = max_j + 1; - if (_tle) *_tle = max_i + 1; - if (_gtle) *_gtle = max_ie + 1; - if (_gscore) *_gscore = gscore; - if (_max_off) *_max_off = max_off; - return max; -} - -int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off) -{ - return ksw_extend2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, w, end_bonus, zdrop, h0, qle, tle, gtle, gscore, max_off); -} - -/******************** - * Global alignment * - ********************/ - -#define MINUS_INF -0x40000000 - -static inline uint32_t *push_cigar(int *n_cigar, int *m_cigar, uint32_t *cigar, int op, int len) -{ - if (*n_cigar == 0 || op != (cigar[(*n_cigar) - 1]&0xf)) { - if (*n_cigar == *m_cigar) { - *m_cigar = *m_cigar? 
(*m_cigar)<<1 : 4; - cigar = realloc(cigar, (*m_cigar) << 2); - } - cigar[(*n_cigar)++] = len<<4 | op; - } else cigar[(*n_cigar)-1] += len<<4; - return cigar; -} - -int ksw_global2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int *n_cigar_, uint32_t **cigar_) -{ - eh_t *eh; - int8_t *qp; // query profile - int i, j, k, oe_del = o_del + e_del, oe_ins = o_ins + e_ins, score, n_col; - uint8_t *z; // backtrack matrix; in each cell: f<<4|e<<2|h; in principle, we can halve the memory, but backtrack will be a little more complex - if (n_cigar_) *n_cigar_ = 0; - // allocate memory - n_col = qlen < 2*w+1? qlen : 2*w+1; // maximum #columns of the backtrack matrix - z = n_cigar_ && cigar_? malloc((long)n_col * tlen) : 0; - qp = malloc(qlen * m); - eh = calloc(qlen + 1, 8); - // generate the query profile - for (k = i = 0; k < m; ++k) { - const int8_t *p = &mat[k * m]; - for (j = 0; j < qlen; ++j) qp[i++] = p[query[j]]; - } - // fill the first row - eh[0].h = 0; eh[0].e = MINUS_INF; - for (j = 1; j <= qlen && j <= w; ++j) - eh[j].h = -(o_ins + e_ins * j), eh[j].e = MINUS_INF; - for (; j <= qlen; ++j) eh[j].h = eh[j].e = MINUS_INF; // everything is -inf outside the band - // DP loop - for (i = 0; LIKELY(i < tlen); ++i) { // target sequence is in the outer loop - int32_t f = MINUS_INF, h1, beg, end, t; - int8_t *q = &qp[target[i] * qlen]; - beg = i > w? i - w : 0; - end = i + w + 1 < qlen? i + w + 1 : qlen; // only loop through [beg,end) of the query sequence - h1 = beg == 0? 
-(o_del + e_del * (i + 1)) : MINUS_INF; - if (n_cigar_ && cigar_) { - uint8_t *zi = &z[(long)i * n_col]; - for (j = beg; LIKELY(j < end); ++j) { - // At the beginning of the loop: eh[j] = { H(i-1,j-1), E(i,j) }, f = F(i,j) and h1 = H(i,j-1) - // Cells are computed in the following order: - // M(i,j) = H(i-1,j-1) + S(i,j) - // H(i,j) = max{M(i,j), E(i,j), F(i,j)} - // E(i+1,j) = max{M(i,j)-gapo, E(i,j)} - gape - // F(i,j+1) = max{M(i,j)-gapo, F(i,j)} - gape - // We have to separate M(i,j); otherwise the direction may not be recorded correctly. - // However, a CIGAR like "10M3I3D10M" allowed by local() is disallowed by global(). - // Such a CIGAR may occur, in theory, if mismatch_penalty > 2*gap_ext_penalty + 2*gap_open_penalty/k. - // In practice, this should happen very rarely given a reasonable scoring system. - eh_t *p = &eh[j]; - int32_t h, m = p->h, e = p->e; - uint8_t d; // direction - p->h = h1; - m += q[j]; - d = m >= e? 0 : 1; - h = m >= e? m : e; - d = h >= f? d : 2; - h = h >= f? h : f; - h1 = h; - t = m - oe_del; - e -= e_del; - d |= e > t? 1<<2 : 0; - e = e > t? e : t; - p->e = e; - t = m - oe_ins; - f -= e_ins; - d |= f > t? 2<<4 : 0; // if we want to halve the memory, use one bit only, instead of two - f = f > t? f : t; - zi[j - beg] = d; // z[i,j] keeps h for the current cell and e/f for the next cell - } - } else { - for (j = beg; LIKELY(j < end); ++j) { - eh_t *p = &eh[j]; - int32_t h, m = p->h, e = p->e; - p->h = h1; - m += q[j]; - h = m >= e? m : e; - h = h >= f? h : f; - h1 = h; - t = m - oe_del; - e -= e_del; - e = e > t? e : t; - p->e = e; - t = m - oe_ins; - f -= e_ins; - f = f > t? f : t; - } - } - eh[end].h = h1; eh[end].e = MINUS_INF; - } - score = eh[qlen].h; - if (n_cigar_ && cigar_) { // backtrack - int n_cigar = 0, m_cigar = 0, which = 0; - uint32_t *cigar = 0, tmp; - i = tlen - 1; k = (i + w + 1 < qlen? i + w + 1 : qlen) - 1; // (i,k) points to the last cell - while (i >= 0 && k >= 0) { - which = z[(long)i * n_col + (k - (i > w? 
i - w : 0))] >> (which<<1) & 3; - if (which == 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 0, 1), --i, --k; - else if (which == 1) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, 1), --i; - else cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, 1), --k; - } - if (i >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 2, i + 1); - if (k >= 0) cigar = push_cigar(&n_cigar, &m_cigar, cigar, 1, k + 1); - for (i = 0; i < n_cigar>>1; ++i) // reverse CIGAR - tmp = cigar[i], cigar[i] = cigar[n_cigar-1-i], cigar[n_cigar-1-i] = tmp; - *n_cigar_ = n_cigar, *cigar_ = cigar; - } - free(eh); free(qp); free(z); - return score; -} - -int ksw_global(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int *n_cigar_, uint32_t **cigar_) -{ - return ksw_global2(qlen, query, tlen, target, m, mat, gapo, gape, gapo, gape, w, n_cigar_, cigar_); -} - -/******************************************* - * Main function (not compiled by default) * - *******************************************/ - -#ifdef _KSW_MAIN - -#include -#include -#include -#include "kseq.h" -KSEQ_INIT(gzFile, err_gzread) - -unsigned char seq_nt4_table[256] = { - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 -}; - -int 
main(int argc, char *argv[]) -{ - int c, sa = 1, sb = 3, i, j, k, forward_only = 0, max_rseq = 0; - int8_t mat[25]; - int gapo = 5, gape = 2, minsc = 0, xtra = KSW_XSTART; - uint8_t *rseq = 0; - gzFile fpt, fpq; - kseq_t *kst, *ksq; - - // parse command line - while ((c = getopt(argc, argv, "a:b:q:r:ft:1")) >= 0) { - switch (c) { - case 'a': sa = atoi(optarg); break; - case 'b': sb = atoi(optarg); break; - case 'q': gapo = atoi(optarg); break; - case 'r': gape = atoi(optarg); break; - case 't': minsc = atoi(optarg); break; - case 'f': forward_only = 1; break; - case '1': xtra |= KSW_XBYTE; break; - } - } - if (optind + 2 > argc) { - fprintf(stderr, "Usage: ksw [-1] [-f] [-a%d] [-b%d] [-q%d] [-r%d] [-t%d] \n", sa, sb, gapo, gape, minsc); - return 1; - } - if (minsc > 0xffff) minsc = 0xffff; - xtra |= KSW_XSUBO | minsc; - // initialize scoring matrix - for (i = k = 0; i < 4; ++i) { - for (j = 0; j < 4; ++j) - mat[k++] = i == j? sa : -sb; - mat[k++] = 0; // ambiguous base - } - for (j = 0; j < 5; ++j) mat[k++] = 0; - // open file - fpt = xzopen(argv[optind], "r"); kst = kseq_init(fpt); - fpq = xzopen(argv[optind+1], "r"); ksq = kseq_init(fpq); - // all-pair alignment - while (kseq_read(ksq) > 0) { - kswq_t *q[2] = {0, 0}; - kswr_t r; - for (i = 0; i < (int)ksq->seq.l; ++i) ksq->seq.s[i] = seq_nt4_table[(int)ksq->seq.s[i]]; - if (!forward_only) { // reverse - if ((int)ksq->seq.m > max_rseq) { - max_rseq = ksq->seq.m; - rseq = (uint8_t*)realloc(rseq, max_rseq); - } - for (i = 0, j = ksq->seq.l - 1; i < (int)ksq->seq.l; ++i, --j) - rseq[j] = ksq->seq.s[i] == 4? 
4 : 3 - ksq->seq.s[i]; - } - gzrewind(fpt); kseq_rewind(kst); - while (kseq_read(kst) > 0) { - for (i = 0; i < (int)kst->seq.l; ++i) kst->seq.s[i] = seq_nt4_table[(int)kst->seq.s[i]]; - r = ksw_align(ksq->seq.l, (uint8_t*)ksq->seq.s, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[0]); - if (r.score >= minsc) - err_printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, r.qb, r.qe+1, r.score, r.score2, r.te2); - if (rseq) { - r = ksw_align(ksq->seq.l, rseq, kst->seq.l, (uint8_t*)kst->seq.s, 5, mat, gapo, gape, xtra, &q[1]); - if (r.score >= minsc) - err_printf("%s\t%d\t%d\t%s\t%d\t%d\t%d\t%d\t%d\n", kst->name.s, r.tb, r.te+1, ksq->name.s, (int)ksq->seq.l - r.qb, (int)ksq->seq.l - 1 - r.qe, r.score, r.score2, r.te2); - } - } - free(q[0]); free(q[1]); - } - free(rseq); - kseq_destroy(kst); err_gzclose(fpt); - kseq_destroy(ksq); err_gzclose(fpq); - return 0; -} -#endif diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/ksw.h --- a/bwa-0.7.9a/ksw.h Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,114 +0,0 @@ -#ifndef __AC_KSW_H -#define __AC_KSW_H - -#include - -#define KSW_XBYTE 0x10000 -#define KSW_XSTOP 0x20000 -#define KSW_XSUBO 0x40000 -#define KSW_XSTART 0x80000 - -struct _kswq_t; -typedef struct _kswq_t kswq_t; - -typedef struct { - int score; // best score - int te, qe; // target end and query end - int score2, te2; // second best score and ending position on the target - int tb, qb; // target start and query start -} kswr_t; - -#ifdef __cplusplus -extern "C" { -#endif - - /** - * Aligning two sequences - * - * @param qlen length of the query sequence (typically =0, *gscore keeps the best score such that - * the entire query sequence is aligned; *gtle keeps the position on the - * target where *gscore is achieved. Returning *gscore and *gtle helps the - * caller to decide whether an end-to-end hit or a partial hit is preferred. 
- * - * The first 9 parameters are identical to those in ksw_global() - * - * @param h0 alignment score of upstream sequences - * @param _qle (out) length of the query in the alignment - * @param _tle (out) length of the target in the alignment - * @param _gtle (out) length of the target if query is fully aligned - * @param _gscore (out) score of the best end-to-end alignment; negative if not found - * - * @return best semi-local alignment score - */ - int ksw_extend(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int gapo, int gape, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off); - int ksw_extend2(int qlen, const uint8_t *query, int tlen, const uint8_t *target, int m, const int8_t *mat, int o_del, int e_del, int o_ins, int e_ins, int w, int end_bonus, int zdrop, int h0, int *qle, int *tle, int *gtle, int *gscore, int *max_off); - -#ifdef __cplusplus -} -#endif - -#endif diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/kthread.c --- a/bwa-0.7.9a/kthread.c Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,53 +0,0 @@ -#include -#include - -struct kt_for_t; - -typedef struct { - struct kt_for_t *t; - int i; -} ktf_worker_t; - -typedef struct kt_for_t { - int n_threads, n; - ktf_worker_t *w; - void (*func)(void*,int,int); - void *data; -} kt_for_t; - -static inline int steal_work(kt_for_t *t) -{ - int i, k, min = 0x7fffffff, min_i = -1; - for (i = 0; i < t->n_threads; ++i) - if (min > t->w[i].i) min = t->w[i].i, min_i = i; - k = __sync_fetch_and_add(&t->w[min_i].i, t->n_threads); - return k >= t->n? 
-1 : k; -} - -static void *ktf_worker(void *data) -{ - ktf_worker_t *w = (ktf_worker_t*)data; - int i; - for (;;) { - i = __sync_fetch_and_add(&w->i, w->t->n_threads); - if (i >= w->t->n) break; - w->t->func(w->t->data, i, w - w->t->w); - } - while ((i = steal_work(w->t)) >= 0) - w->t->func(w->t->data, i, w - w->t->w); - pthread_exit(0); -} - -void kt_for(int n_threads, void (*func)(void*,int,int), void *data, int n) -{ - int i; - kt_for_t t; - pthread_t *tid; - t.func = func, t.data = data, t.n_threads = n_threads, t.n = n; - t.w = (ktf_worker_t*)alloca(n_threads * sizeof(ktf_worker_t)); - tid = (pthread_t*)alloca(n_threads * sizeof(pthread_t)); - for (i = 0; i < n_threads; ++i) - t.w[i].t = &t, t.w[i].i = i; - for (i = 0; i < n_threads; ++i) pthread_create(&tid[i], 0, ktf_worker, &t.w[i]); - for (i = 0; i < n_threads; ++i) pthread_join(tid[i], 0); -} diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/kvec.h --- a/bwa-0.7.9a/kvec.h Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,94 +0,0 @@ -/* The MIT License - - Copyright (c) 2008, by Attractive Chaos - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* - An example: - -#include "kvec.h" -int main() { - kvec_t(int) array; - kv_init(array); - kv_push(int, array, 10); // append - kv_a(int, array, 20) = 5; // dynamic - kv_A(array, 20) = 4; // static - kv_destroy(array); - return 0; -} -*/ - -/* - 2008-09-22 (0.1.0): - - * The initial version. - -*/ - -#ifndef AC_KVEC_H -#define AC_KVEC_H - -#include - -#ifdef USE_MALLOC_WRAPPERS -# include "malloc_wrap.h" -#endif - -#define kv_roundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x)) - -#define kvec_t(type) struct { size_t n, m; type *a; } -#define kv_init(v) ((v).n = (v).m = 0, (v).a = 0) -#define kv_destroy(v) free((v).a) -#define kv_A(v, i) ((v).a[(i)]) -#define kv_pop(v) ((v).a[--(v).n]) -#define kv_size(v) ((v).n) -#define kv_max(v) ((v).m) - -#define kv_resize(type, v, s) ((v).m = (s), (v).a = (type*)realloc((v).a, sizeof(type) * (v).m)) - -#define kv_copy(type, v1, v0) do { \ - if ((v1).m < (v0).n) kv_resize(type, v1, (v0).n); \ - (v1).n = (v0).n; \ - memcpy((v1).a, (v0).a, sizeof(type) * (v0).n); \ - } while (0) \ - -#define kv_push(type, v, x) do { \ - if ((v).n == (v).m) { \ - (v).m = (v).m? (v).m<<1 : 2; \ - (v).a = (type*)realloc((v).a, sizeof(type) * (v).m); \ - } \ - (v).a[(v).n++] = (x); \ - } while (0) - -#define kv_pushp(type, v) ((((v).n == (v).m)? \ - ((v).m = ((v).m? (v).m<<1 : 2), \ - (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ - : 0), &(v).a[(v).n++]) - -#define kv_a(type, v, i) (((v).m <= (size_t)(i)? \ - ((v).m = (v).n = (i) + 1, kv_roundup32((v).m), \ - (v).a = (type*)realloc((v).a, sizeof(type) * (v).m), 0) \ - : (v).n <= (size_t)(i)? 
(v).n = (i) + 1 \ - : 0), (v).a[(i)]) - -#endif diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/main.c --- a/bwa-0.7.9a/main.c Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,100 +0,0 @@ -#include -#include -#include "kstring.h" -#include "utils.h" - -#ifndef PACKAGE_VERSION -#define PACKAGE_VERSION "0.7.9a-r786" -#endif - -int bwa_fa2pac(int argc, char *argv[]); -int bwa_pac2bwt(int argc, char *argv[]); -int bwa_bwtupdate(int argc, char *argv[]); -int bwa_bwt2sa(int argc, char *argv[]); -int bwa_index(int argc, char *argv[]); -int bwt_bwtgen_main(int argc, char *argv[]); - -int bwa_aln(int argc, char *argv[]); -int bwa_sai2sam_se(int argc, char *argv[]); -int bwa_sai2sam_pe(int argc, char *argv[]); - -int bwa_bwtsw2(int argc, char *argv[]); - -int main_fastmap(int argc, char *argv[]); -int main_mem(int argc, char *argv[]); - -int main_pemerge(int argc, char *argv[]); - -char *bwa_pg; - -static int usage() -{ - fprintf(stderr, "\n"); - fprintf(stderr, "Program: bwa (alignment via Burrows-Wheeler transformation)\n"); - fprintf(stderr, "Version: %s\n", PACKAGE_VERSION); - fprintf(stderr, "Contact: Heng Li \n\n"); - fprintf(stderr, "Usage: bwa [options]\n\n"); - fprintf(stderr, "Command: index index sequences in the FASTA format\n"); - fprintf(stderr, " mem BWA-MEM algorithm\n"); - fprintf(stderr, " fastmap identify super-maximal exact matches\n"); - fprintf(stderr, " pemerge merge overlapping paired ends (EXPERIMENTAL)\n"); - fprintf(stderr, " aln gapped/ungapped alignment\n"); - fprintf(stderr, " samse generate alignment (single ended)\n"); - fprintf(stderr, " sampe generate alignment (paired ended)\n"); - fprintf(stderr, " bwasw BWA-SW for long queries\n"); - fprintf(stderr, "\n"); - fprintf(stderr, " fa2pac convert FASTA to PAC format\n"); - fprintf(stderr, " pac2bwt generate BWT from PAC\n"); - fprintf(stderr, " pac2bwtgen alternative algorithm for generating BWT\n"); - fprintf(stderr, " bwtupdate update .bwt to the new 
format\n"); - fprintf(stderr, " bwt2sa generate SA from BWT and Occ\n"); - fprintf(stderr, "\n"); - fprintf(stderr, -"Note: To use BWA, you need to first index the genome with `bwa index'.\n" -" There are three alignment algorithms in BWA: `mem', `bwasw', and\n" -" `aln/samse/sampe'. If you are not sure which to use, try `bwa mem'\n" -" first. Please `man ./bwa.1' for the manual.\n\n"); - return 1; -} - -int main(int argc, char *argv[]) -{ - int i, ret; - double t_real; - kstring_t pg = {0,0,0}; - t_real = realtime(); - ksprintf(&pg, "@PG\tID:bwa\tPN:bwa\tVN:%s\tCL:%s", PACKAGE_VERSION, argv[0]); - for (i = 1; i < argc; ++i) ksprintf(&pg, " %s", argv[i]); - bwa_pg = pg.s; - if (argc < 2) return usage(); - if (strcmp(argv[1], "fa2pac") == 0) ret = bwa_fa2pac(argc-1, argv+1); - else if (strcmp(argv[1], "pac2bwt") == 0) ret = bwa_pac2bwt(argc-1, argv+1); - else if (strcmp(argv[1], "pac2bwtgen") == 0) ret = bwt_bwtgen_main(argc-1, argv+1); - else if (strcmp(argv[1], "bwtupdate") == 0) ret = bwa_bwtupdate(argc-1, argv+1); - else if (strcmp(argv[1], "bwt2sa") == 0) ret = bwa_bwt2sa(argc-1, argv+1); - else if (strcmp(argv[1], "index") == 0) ret = bwa_index(argc-1, argv+1); - else if (strcmp(argv[1], "aln") == 0) ret = bwa_aln(argc-1, argv+1); - else if (strcmp(argv[1], "samse") == 0) ret = bwa_sai2sam_se(argc-1, argv+1); - else if (strcmp(argv[1], "sampe") == 0) ret = bwa_sai2sam_pe(argc-1, argv+1); - else if (strcmp(argv[1], "bwtsw2") == 0) ret = bwa_bwtsw2(argc-1, argv+1); - else if (strcmp(argv[1], "dbwtsw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); - else if (strcmp(argv[1], "bwasw") == 0) ret = bwa_bwtsw2(argc-1, argv+1); - else if (strcmp(argv[1], "fastmap") == 0) ret = main_fastmap(argc-1, argv+1); - else if (strcmp(argv[1], "mem") == 0) ret = main_mem(argc-1, argv+1); - else if (strcmp(argv[1], "pemerge") == 0) ret = main_pemerge(argc-1, argv+1); - else { - fprintf(stderr, "[main] unrecognized command '%s'\n", argv[1]); - return 1; - } - err_fflush(stdout); - 
err_fclose(stdout); - if (ret == 0) { - fprintf(stderr, "[%s] Version: %s\n", __func__, PACKAGE_VERSION); - fprintf(stderr, "[%s] CMD:", __func__); - for (i = 0; i < argc; ++i) - fprintf(stderr, " %s", argv[i]); - fprintf(stderr, "\n[%s] Real time: %.3f sec; CPU: %.3f sec\n", __func__, realtime() - t_real, cputime()); - } - free(bwa_pg); - return ret; -} diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/malloc_wrap.c --- a/bwa-0.7.9a/malloc_wrap.c Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,57 +0,0 @@ -#include -#include -#include -#include -#ifdef USE_MALLOC_WRAPPERS -/* Don't wrap ourselves */ -# undef USE_MALLOC_WRAPPERS -#endif -#include "malloc_wrap.h" - -void *wrap_calloc(size_t nmemb, size_t size, - const char *file, unsigned int line, const char *func) { - void *p = calloc(nmemb, size); - if (NULL == p) { - fprintf(stderr, - "[%s] Failed to allocate %zd bytes at %s line %u: %s\n", - func, nmemb * size, file, line, strerror(errno)); - exit(EXIT_FAILURE); - } - return p; -} - -void *wrap_malloc(size_t size, - const char *file, unsigned int line, const char *func) { - void *p = malloc(size); - if (NULL == p) { - fprintf(stderr, - "[%s] Failed to allocate %zd bytes at %s line %u: %s\n", - func, size, file, line, strerror(errno)); - exit(EXIT_FAILURE); - } - return p; -} - -void *wrap_realloc(void *ptr, size_t size, - const char *file, unsigned int line, const char *func) { - void *p = realloc(ptr, size); - if (NULL == p) { - fprintf(stderr, - "[%s] Failed to allocate %zd bytes at %s line %u: %s\n", - func, size, file, line, strerror(errno)); - exit(EXIT_FAILURE); - } - return p; -} - -char *wrap_strdup(const char *s, - const char *file, unsigned int line, const char *func) { - char *p = strdup(s); - if (NULL == p) { - fprintf(stderr, - "[%s] Failed to allocate %zd bytes at %s line %u: %s\n", - func, strlen(s), file, line, strerror(errno)); - exit(EXIT_FAILURE); - } - return p; -} diff -r ce5a8082bbb8 -r abdbc8fe98dd 
bwa-0.7.9a/malloc_wrap.h --- a/bwa-0.7.9a/malloc_wrap.h Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,47 +0,0 @@ -#ifndef MALLOC_WRAP_H -#define MALLOC_WRAP_H - -#include /* Avoid breaking the usual definitions */ -#include - -#ifdef __cplusplus -extern "C" { -#endif - - void *wrap_calloc(size_t nmemb, size_t size, - const char *file, unsigned int line, const char *func); - void *wrap_malloc(size_t size, - const char *file, unsigned int line, const char *func); - void *wrap_realloc(void *ptr, size_t size, - const char *file, unsigned int line, const char *func); - char *wrap_strdup(const char *s, - const char *file, unsigned int line, const char *func); - -#ifdef __cplusplus -} -#endif - -#ifdef USE_MALLOC_WRAPPERS -# ifdef calloc -# undef calloc -# endif -# define calloc(n, s) wrap_calloc( (n), (s), __FILE__, __LINE__, __func__) - -# ifdef malloc -# undef malloc -# endif -# define malloc(s) wrap_malloc( (s), __FILE__, __LINE__, __func__) - -# ifdef realloc -# undef realloc -# endif -# define realloc(p, s) wrap_realloc((p), (s), __FILE__, __LINE__, __func__) - -# ifdef strdup -# undef strdup -# endif -# define strdup(s) wrap_strdup( (s), __FILE__, __LINE__, __func__) - -#endif /* USE_MALLOC_WRAPPERS */ - -#endif /* MALLOC_WRAP_H */ diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/pemerge.c --- a/bwa-0.7.9a/pemerge.c Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,291 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include "ksw.h" -#include "kseq.h" -#include "kstring.h" -#include "bwa.h" -#include "utils.h" -KSEQ_DECLARE(gzFile) - -#ifdef USE_MALLOC_WRAPPERS -# include "malloc_wrap.h" -#endif - -#define MAX_SCORE_RATIO 0.9f -#define MAX_ERR 8 - -static const char *err_msg[MAX_ERR+1] = { - "successful merges", - "low-scoring pairs", - "pairs where the best SW alignment is not an overlap (long left end)", - "pairs where the best SW alignment is not an overlap (long 
right end)", - "pairs with large 2nd best SW score", - "pairs with gapped overlap", - "pairs where the end-to-end alignment is inconsistent with SW", - "pairs potentially with tandem overlaps", - "pairs with high sum of errors" -}; - -typedef struct { - int a, b, q, r, w; - int q_def, q_thres; - int T; - int chunk_size; - int n_threads; - int flag; // bit 1: print merged; 2: print unmerged - int8_t mat[25]; -} pem_opt_t; - -pem_opt_t *pem_opt_init() -{ - pem_opt_t *opt; - opt = calloc(1, sizeof(pem_opt_t)); - opt->a = 5; opt->b = 4; opt->q = 2, opt->r = 17; opt->w = 20; - opt->T = opt->a * 10; - opt->q_def = 20; - opt->q_thres = 70; - opt->chunk_size = 10000000; - opt->n_threads = 1; - opt->flag = 3; - bwa_fill_scmat(opt->a, opt->b, opt->mat); - return opt; -} - -int bwa_pemerge(const pem_opt_t *opt, bseq1_t x[2]) -{ - uint8_t *s[2], *q[2], *seq, *qual; - int i, xtra, l, l_seq, sum_q, ret = 0; - kswr_t r; - - s[0] = malloc(x[0].l_seq); q[0] = malloc(x[0].l_seq); - s[1] = malloc(x[1].l_seq); q[1] = malloc(x[1].l_seq); - for (i = 0; i < x[0].l_seq; ++i) { - int c = x[0].seq[i]; - s[0][i] = c < 0 || c > 127? 4 : c <= 4? c : nst_nt4_table[c]; - q[0][i] = x[0].qual? x[0].qual[i] - 33 : opt->q_def; - } - for (i = 0; i < x[1].l_seq; ++i) { - int c = x[1].seq[x[1].l_seq - 1 - i]; - c = c < 0 || c > 127? 4 : c < 4? c : nst_nt4_table[c]; - s[1][i] = c < 4? 3 - c : 4; - q[1][i] = x[1].qual? 
x[1].qual[x[1].l_seq - 1 - i] - 33 : opt->q_def; - } - - xtra = KSW_XSTART | KSW_XSUBO; - r = ksw_align(x[1].l_seq, s[1], x[0].l_seq, s[0], 5, opt->mat, opt->q, opt->r, xtra, 0); - ++r.qe; ++r.te; // change to the half-close-half-open coordinates - - if (r.score < opt->T) { ret = -1; goto pem_ret; } // poor alignment - if (r.tb < r.qb) { ret = -2; goto pem_ret; } // no enough space for the left end - if (x[0].l_seq - r.te > x[1].l_seq - r.qe) { ret = -3; goto pem_ret; } // no enough space for the right end - if ((double)r.score2 / r.score >= MAX_SCORE_RATIO) { ret = -4; goto pem_ret; } // the second best score is too large - if (r.qe - r.qb != r.te - r.tb) { ret = -5; goto pem_ret; } // we do not allow gaps - - { // test tandem match; O(n^2) - int max_m, max_m2, min_l, max_l, max_l2; - max_m = max_m2 = 0; max_l = max_l2 = 0; - min_l = x[0].l_seq < x[1].l_seq? x[0].l_seq : x[1].l_seq; - for (l = 1; l < min_l; ++l) { - int m = 0, o = x[0].l_seq - l; - uint8_t *s0o = &s[0][o], *s1 = s[1]; - for (i = 0; i < l; ++i) // TODO: in principle, this can be done with SSE2. It is the bottleneck! 
- m += opt->mat[(s1[i]<<2) + s1[i] + s0o[i]]; // equivalent to s[1][i]*5 + s[0][o+i] - if (m > max_m) max_m2 = max_m, max_m = m, max_l2 = max_l, max_l = l; - else if (m > max_m2) max_m2 = m, max_l2 = l; - } - if (max_m < opt->T || max_l != x[0].l_seq - (r.tb - r.qb)) { ret = -6; goto pem_ret; } - if (max_l2 < max_l && max_m2 >= opt->T && (double)(max_m2 + (max_l - max_l2) * opt->a) / max_m >= MAX_SCORE_RATIO) { - ret = -7; goto pem_ret; - } - if (max_l2 > max_l && (double)max_m2 / max_m >= MAX_SCORE_RATIO) { ret = -7; goto pem_ret; } - } - - l = x[0].l_seq - (r.tb - r.qb); // length to merge - l_seq = x[0].l_seq + x[1].l_seq - l; - seq = malloc(l_seq + 1); - qual = malloc(l_seq + 1); - memcpy(seq, s[0], x[0].l_seq); memcpy(seq + x[0].l_seq, &s[1][l], x[1].l_seq - l); - memcpy(qual, q[0], x[0].l_seq); memcpy(qual + x[0].l_seq, &q[1][l], x[1].l_seq - l); - for (i = 0, sum_q = 0; i < l; ++i) { - int k = x[0].l_seq - l + i; - if (s[0][k] == 4) { // ambiguous - seq[k] = s[1][i]; - qual[k] = q[1][i]; - } else if (s[1][i] == 4) { // do nothing - } else if (s[0][k] == s[1][i]) { - qual[k] = qual[k] > q[1][i]? qual[k] : q[1][i]; - } else { // s[0][k] != s[1][i] and neither is N - int qq = q[0][k] < q[1][i]? q[0][k] : q[1][i]; - sum_q += qq >= 3? qq<<1 : 1; - seq[k] = q[0][k] > q[1][i]? s[0][k] : s[1][i]; - qual[k] = abs((int)q[0][k] - (int)q[1][i]); - } - } - if (sum_q>>1 > opt->q_thres) { // too many mismatches - free(seq); free(qual); - ret = -8; goto pem_ret; - } - - for (i = 0; i < l_seq; ++i) seq[i] = "ACGTN"[(int)seq[i]], qual[i] += 33; - seq[l_seq] = qual[l_seq] = 0; - - free(x[1].name); free(x[1].seq); free(x[1].qual); free(x[1].comment); - memset(&x[1], 0, sizeof(bseq1_t)); - free(x[0].seq); free(x[0].qual); - x[0].l_seq = l_seq; x[0].seq = (char*)seq; x[0].qual = (char*)qual; - -pem_ret: - free(s[0]); free(s[1]); free(q[0]); free(q[1]); - return ret; -} - -static inline void print_bseq(const bseq1_t *s, int rn) -{ - err_putchar(s->qual? 
'@' : '>'); - err_fputs(s->name, stdout); - if (rn == 1 || rn == 2) { - err_putchar('/'); err_putchar('0' + rn); err_putchar('\n'); - } else err_puts(" merged"); - err_puts(s->seq); - if (s->qual) { - err_puts("+"); err_puts(s->qual); - } -} - -typedef struct { - int n, start; - bseq1_t *seqs; - int64_t cnt[MAX_ERR+1]; - const pem_opt_t *opt; -} worker_t; - -void *worker(void *data) -{ - worker_t *w = (worker_t*)data; - int i; - for (i = w->start; i < w->n>>1; i += w->opt->n_threads) - ++w->cnt[-bwa_pemerge(w->opt, &w->seqs[i<<1])]; - return 0; -} - -static void process_seqs(const pem_opt_t *opt, int n_, bseq1_t *seqs, int64_t cnt[MAX_ERR+1]) -{ - int i, j, n = n_>>1<<1; - worker_t *w; - - w = calloc(opt->n_threads, sizeof(worker_t)); - for (i = 0; i < opt->n_threads; ++i) { - worker_t *p = &w[i]; - p->start = i; p->n = n; - p->opt = opt; - p->seqs = seqs; - } - if (opt->n_threads == 1) { - worker(w); - } else { - pthread_t *tid; - tid = (pthread_t*)calloc(opt->n_threads, sizeof(pthread_t)); - for (i = 0; i < opt->n_threads; ++i) pthread_create(&tid[i], 0, worker, &w[i]); - for (i = 0; i < opt->n_threads; ++i) pthread_join(tid[i], 0); - free(tid); - } - for (i = 0; i < opt->n_threads; ++i) { - worker_t *p = &w[i]; - for (j = 0; j <= MAX_ERR; ++j) cnt[j] += p->cnt[j]; - } - free(w); - for (i = 0; i < n>>1; ++i) { - if (seqs[i<<1|1].l_seq != 0) { - if (opt->flag&2) { - print_bseq(&seqs[i<<1|0], 1); - print_bseq(&seqs[i<<1|1], 2); - } - } else if (opt->flag&1) - print_bseq(&seqs[i<<1|0], 0); - } - for (i = 0; i < n; ++i) { - bseq1_t *s = &seqs[i]; - free(s->name); free(s->seq); free(s->qual); free(s->comment); - } -} - -int main_pemerge(int argc, char *argv[]) -{ - int c, flag = 0, i, n, min_ovlp = 10; - int64_t cnt[MAX_ERR+1]; - bseq1_t *bseq; - gzFile fp, fp2 = 0; - kseq_t *ks, *ks2 = 0; - pem_opt_t *opt; - - opt = pem_opt_init(); - while ((c = getopt(argc, argv, "muQ:t:T:")) >= 0) { - if (c == 'm') flag |= 1; - else if (c == 'u') flag |= 2; - else if (c == 'Q') 
opt->q_thres = atoi(optarg); - else if (c == 't') opt->n_threads = atoi(optarg); - else if (c == 'T') min_ovlp = atoi(optarg); - else return 1; - } - if (flag == 0) flag = 3; - opt->flag = flag; - opt->T = opt->a * min_ovlp; - - if (optind == argc) { - fprintf(stderr, "\n"); - fprintf(stderr, "Usage: bwa pemerge [-mu] [read2.fq]\n\n"); - fprintf(stderr, "Options: -m output merged reads only\n"); - fprintf(stderr, " -u output unmerged reads only\n"); - fprintf(stderr, " -t INT number of threads [%d]\n", opt->n_threads); - fprintf(stderr, " -T INT minimum end overlap [%d]\n", min_ovlp); - fprintf(stderr, " -Q INT max sum of errors [%d]\n", opt->q_thres); - fprintf(stderr, "\n"); - free(opt); - return 1; - } - - fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); - if (NULL == fp) { - fprintf(stderr, "Couldn't open %s : %s\n", - strcmp(argv[optind], "-") ? argv[optind] : "stdin", - errno ? strerror(errno) : "Out of memory"); - exit(EXIT_FAILURE); - } - ks = kseq_init(fp); - if (optind + 1 < argc) { - fp2 = strcmp(argv[optind+1], "-")? gzopen(argv[optind+1], "r") : gzdopen(fileno(stdin), "r"); - if (NULL == fp) { - fprintf(stderr, "Couldn't open %s : %s\n", - strcmp(argv[optind+1], "-") ? argv[optind+1] : "stdin", - errno ? 
strerror(errno) : "Out of memory"); - exit(EXIT_FAILURE); - } - ks2 = kseq_init(fp2); - } - - memset(cnt, 0, 8 * (MAX_ERR+1)); - while ((bseq = bseq_read(opt->n_threads * opt->chunk_size, &n, ks, ks2)) != 0) { - process_seqs(opt, n, bseq, cnt); - free(bseq); - } - - fprintf(stderr, "%12ld %s\n", (long)cnt[0], err_msg[0]); - for (i = 1; i <= MAX_ERR; ++i) - fprintf(stderr, "%12ld %s\n", (long)cnt[i], err_msg[i]); - kseq_destroy(ks); - err_gzclose(fp); - if (ks2) { - kseq_destroy(ks2); - err_gzclose(fp2); - } - free(opt); - - err_fflush(stdout); - - return 0; -} diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/qualfa2fq.pl --- a/bwa-0.7.9a/qualfa2fq.pl Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,27 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use warnings; - -die("Usage: qualfa2fq.pl \n") if (@ARGV != 2); - -my ($fhs, $fhq, $q); -open($fhs, ($ARGV[0] =~ /\.gz$/)? "gzip -dc $ARGV[0] |" : $ARGV[0]) || die; -open($fhq, ($ARGV[1] =~ /\.gz$/)? "gzip -dc $ARGV[1] |" : $ARGV[1]) || die; - -$/ = ">"; <$fhs>; <$fhq>; $/ = "\n"; -while (<$fhs>) { - $q = <$fhq>; - print "\@$_"; - $/ = ">"; - $_ = <$fhs>; $q = <$fhq>; - chomp; chomp($q); - $q =~ s/\s*(\d+)\s*/chr($1+33)/eg; - print $_, "+\n"; - for (my $i = 0; $i < length($q); $i += 60) { - print substr($q, $i, 60), "\n"; - } - $/ = "\n"; -} - -close($fhs); close($fhq); diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/utils.c --- a/bwa-0.7.9a/utils.c Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,284 +0,0 @@ -/* The MIT License - - Copyright (c) 2008 Genome Research Ltd (GRL). 
- - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* Contact: Heng Li */ -#define FSYNC_ON_FLUSH - -#include -#include -#include -#include -#include -#include -#ifdef FSYNC_ON_FLUSH -#include -#include -#include -#endif -#include -#include -#include "utils.h" - -#include "ksort.h" -#define pair64_lt(a, b) ((a).x < (b).x || ((a).x == (b).x && (a).y < (b).y)) -KSORT_INIT(128, pair64_t, pair64_lt) -KSORT_INIT(64, uint64_t, ks_lt_generic) - -#include "kseq.h" -KSEQ_INIT2(, gzFile, err_gzread) - -/******************** - * System utilities * - ********************/ - -FILE *err_xopen_core(const char *func, const char *fn, const char *mode) -{ - FILE *fp = 0; - if (strcmp(fn, "-") == 0) - return (strstr(mode, "r"))? 
stdin : stdout; - if ((fp = fopen(fn, mode)) == 0) { - err_fatal(func, "fail to open file '%s' : %s", fn, strerror(errno)); - } - return fp; -} - -FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE *fp) -{ - if (freopen(fn, mode, fp) == 0) { - err_fatal(func, "fail to open file '%s' : %s", fn, strerror(errno)); - } - return fp; -} - -gzFile err_xzopen_core(const char *func, const char *fn, const char *mode) -{ - gzFile fp; - if (strcmp(fn, "-") == 0) { - fp = gzdopen(fileno((strstr(mode, "r"))? stdin : stdout), mode); - /* According to zlib.h, this is the only reason gzdopen can fail */ - if (!fp) err_fatal(func, "Out of memory"); - return fp; - } - if ((fp = gzopen(fn, mode)) == 0) { - err_fatal(func, "fail to open file '%s' : %s", fn, errno ? strerror(errno) : "Out of memory"); - } - return fp; -} - -void err_fatal(const char *header, const char *fmt, ...) -{ - va_list args; - va_start(args, fmt); - fprintf(stderr, "[%s] ", header); - vfprintf(stderr, fmt, args); - fprintf(stderr, "\n"); - va_end(args); - exit(EXIT_FAILURE); -} - -void err_fatal_core(const char *header, const char *fmt, ...) -{ - va_list args; - va_start(args, fmt); - fprintf(stderr, "[%s] ", header); - vfprintf(stderr, fmt, args); - fprintf(stderr, " Abort!\n"); - va_end(args); - abort(); -} - -void _err_fatal_simple(const char *func, const char *msg) -{ - fprintf(stderr, "[%s] %s\n", func, msg); - exit(EXIT_FAILURE); -} - -void _err_fatal_simple_core(const char *func, const char *msg) -{ - fprintf(stderr, "[%s] %s Abort!\n", func, msg); - abort(); -} - -size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream) -{ - size_t ret = fwrite(ptr, size, nmemb, stream); - if (ret != nmemb) - _err_fatal_simple("fwrite", strerror(errno)); - return ret; -} - -size_t err_fread_noeof(void *ptr, size_t size, size_t nmemb, FILE *stream) -{ - size_t ret = fread(ptr, size, nmemb, stream); - if (ret != nmemb) - { - _err_fatal_simple("fread", ferror(stream) ? 
strerror(errno) : "Unexpected end of file"); - } - return ret; -} - -int err_gzread(gzFile file, void *ptr, unsigned int len) -{ - int ret = gzread(file, ptr, len); - - if (ret < 0) - { - int errnum = 0; - const char *msg = gzerror(file, &errnum); - _err_fatal_simple("gzread", Z_ERRNO == errnum ? strerror(errno) : msg); - } - - return ret; -} - -int err_fseek(FILE *stream, long offset, int whence) -{ - int ret = fseek(stream, offset, whence); - if (0 != ret) - { - _err_fatal_simple("fseek", strerror(errno)); - } - return ret; -} - -long err_ftell(FILE *stream) -{ - long ret = ftell(stream); - if (-1 == ret) - { - _err_fatal_simple("ftell", strerror(errno)); - } - return ret; -} - -int err_printf(const char *format, ...) -{ - va_list arg; - int done; - va_start(arg, format); - done = vfprintf(stdout, format, arg); - int saveErrno = errno; - va_end(arg); - if (done < 0) _err_fatal_simple("vfprintf(stdout)", strerror(saveErrno)); - return done; -} - -int err_fprintf(FILE *stream, const char *format, ...) -{ - va_list arg; - int done; - va_start(arg, format); - done = vfprintf(stream, format, arg); - int saveErrno = errno; - va_end(arg); - if (done < 0) _err_fatal_simple("vfprintf", strerror(saveErrno)); - return done; -} - -int err_fputc(int c, FILE *stream) -{ - int ret = putc(c, stream); - if (EOF == ret) - { - _err_fatal_simple("fputc", strerror(errno)); - } - - return ret; -} - -int err_fputs(const char *s, FILE *stream) -{ - int ret = fputs(s, stream); - if (EOF == ret) - { - _err_fatal_simple("fputs", strerror(errno)); - } - - return ret; -} - -int err_fflush(FILE *stream) -{ - int ret = fflush(stream); - if (ret != 0) _err_fatal_simple("fflush", strerror(errno)); - -#ifdef FSYNC_ON_FLUSH - /* Calling fflush() ensures that all the data has made it to the - kernel buffers, but this may not be sufficient for remote filesystems - (e.g. NFS, lustre) as an error may still occur while the kernel - is copying the buffered data to the file server. 
To be sure of - catching these errors, we need to call fsync() on the file - descriptor, but only if it is a regular file. */ - { - struct stat sbuf; - if (0 != fstat(fileno(stream), &sbuf)) - _err_fatal_simple("fstat", strerror(errno)); - - if (S_ISREG(sbuf.st_mode)) - { - if (0 != fsync(fileno(stream))) - _err_fatal_simple("fsync", strerror(errno)); - } - } -#endif - return ret; -} - -int err_fclose(FILE *stream) -{ - int ret = fclose(stream); - if (ret != 0) _err_fatal_simple("fclose", strerror(errno)); - return ret; -} - -int err_gzclose(gzFile file) -{ - int ret = gzclose(file); - if (Z_OK != ret) - { - _err_fatal_simple("gzclose", Z_ERRNO == ret ? strerror(errno) : zError(ret)); - } - - return ret; -} - -/********* - * Timer * - *********/ - -double cputime() -{ - struct rusage r; - getrusage(RUSAGE_SELF, &r); - return r.ru_utime.tv_sec + r.ru_stime.tv_sec + 1e-6 * (r.ru_utime.tv_usec + r.ru_stime.tv_usec); -} - -double realtime() -{ - struct timeval tp; - struct timezone tzp; - gettimeofday(&tp, &tzp); - return tp.tv_sec + tp.tv_usec * 1e-6; -} diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/utils.h --- a/bwa-0.7.9a/utils.h Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,111 +0,0 @@ -/* The MIT License - - Copyright (c) 2008 Genome Research Ltd (GRL). - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. 
- - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - SOFTWARE. -*/ - -/* Contact: Heng Li */ - -#ifndef LH3_UTILS_H -#define LH3_UTILS_H - -#include -#include -#include - -#ifdef __GNUC__ -// Tell GCC to validate printf format string and args -#define ATTRIBUTE(list) __attribute__ (list) -#else -#define ATTRIBUTE(list) -#endif - -#define err_fatal_simple(msg) _err_fatal_simple(__func__, msg) -#define err_fatal_simple_core(msg) _err_fatal_simple_core(__func__, msg) - -#define xopen(fn, mode) err_xopen_core(__func__, fn, mode) -#define xreopen(fn, mode, fp) err_xreopen_core(__func__, fn, mode, fp) -#define xzopen(fn, mode) err_xzopen_core(__func__, fn, mode) - -#define xassert(cond, msg) if ((cond) == 0) _err_fatal_simple_core(__func__, msg) - -typedef struct { - uint64_t x, y; -} pair64_t; - -typedef struct { size_t n, m; uint64_t *a; } uint64_v; -typedef struct { size_t n, m; pair64_t *a; } pair64_v; - -#ifdef __cplusplus -extern "C" { -#endif - - void err_fatal(const char *header, const char *fmt, ...) ATTRIBUTE((noreturn)); - void err_fatal_core(const char *header, const char *fmt, ...) 
ATTRIBUTE((noreturn)); - void _err_fatal_simple(const char *func, const char *msg) ATTRIBUTE((noreturn)); - void _err_fatal_simple_core(const char *func, const char *msg) ATTRIBUTE((noreturn)); - FILE *err_xopen_core(const char *func, const char *fn, const char *mode); - FILE *err_xreopen_core(const char *func, const char *fn, const char *mode, FILE *fp); - gzFile err_xzopen_core(const char *func, const char *fn, const char *mode); - size_t err_fwrite(const void *ptr, size_t size, size_t nmemb, FILE *stream); - size_t err_fread_noeof(void *ptr, size_t size, size_t nmemb, FILE *stream); - - int err_gzread(gzFile file, void *ptr, unsigned int len); - int err_fseek(FILE *stream, long offset, int whence); -#define err_rewind(FP) err_fseek((FP), 0, SEEK_SET) - long err_ftell(FILE *stream); - int err_fprintf(FILE *stream, const char *format, ...) - ATTRIBUTE((format(printf, 2, 3))); - int err_printf(const char *format, ...) - ATTRIBUTE((format(printf, 1, 2))); - int err_fputc(int c, FILE *stream); -#define err_putchar(C) err_fputc((C), stdout) - int err_fputs(const char *s, FILE *stream); -#define err_puts(S) err_fputs((S), stdout) - int err_fflush(FILE *stream); - int err_fclose(FILE *stream); - int err_gzclose(gzFile file); - - double cputime(); - double realtime(); - - void ks_introsort_64 (size_t n, uint64_t *a); - void ks_introsort_128(size_t n, pair64_t *a); - -#ifdef __cplusplus -} -#endif - -static inline uint64_t hash_64(uint64_t key) -{ - key += ~(key << 32); - key ^= (key >> 22); - key += ~(key << 13); - key ^= (key >> 8); - key += (key << 3); - key ^= (key >> 15); - key += ~(key << 27); - key ^= (key >> 31); - return key; -} - -#endif diff -r ce5a8082bbb8 -r abdbc8fe98dd bwa-0.7.9a/xa2multi.pl --- a/bwa-0.7.9a/xa2multi.pl Thu Aug 14 02:16:48 2014 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,25 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use warnings; - -while (<>) { - if (/\tXA:Z:(\S+)/) { - my $l = $1; - print; - my @t = split("\t"); - while ($l 
=~ /([^,;]+),([-+]\d+),([^,]+),(\d+);/g) { - my $mchr = ($t[6] eq $1)? '=' : $t[6]; # FIXME: TLEN/ISIZE is not calculated! - my $seq = $t[9]; - my $phred = $t[10]; - # if alternative alignment has other orientation than primary, - # then print the reverse (complement) of sequence and phred string - if ((($t[1]&0x10)>0) xor ($2<0)) { - $seq = reverse $seq; - $seq =~ tr/ACGTacgt/TGCAtgca/; - $phred = reverse $phred; - } - print(join("\t", $t[0], 0x100|($t[1]&0x6e9)|($2<0?0x10:0), $1, abs($2), 0, $3, @t[6..7], 0, $seq, $phred, "NM:i:$4"), "\n"); - } - } else { print; } -}